/**
 * Parse the cells from a line of the file.
 * @param line the line to parse
 * @param cols The vector to parse into
 * @param expectedCommas The number of expected commas in the line
 * @param isOldTBL boolean to deal with new and old TBL formats.
 * @returns An integer specifying how many columns were parsed into.
 * @throws std::length_error if anything other than 17 columns (or 16
 * cell-delimiting commas) is found when loading an old Refl TBL. A
 * length_error will be thrown for new TBL formats if there are fewer column
 * headings than expected commas.
 */
size_t LoadTBL::getCells(std::string line, std::vector<std::string> &cols,
                         size_t expectedCommas, bool isOldTBL) const {
  // First check the number of commas in the line.
  size_t found = countCommas(line);
  if (isOldTBL) {
    if (found == expectedCommas) {
      // If there are 16 that simplifies things and boost can do the hard work.
      boost::split(cols, line, boost::is_any_of(","),
                   boost::token_compress_off);
    } else if (found < expectedCommas) {
      // Fewer than 16 means the line isn't properly formatted, so throw.
      std::string message = "A line must contain " +
                            std::to_string(expectedCommas) +
                            " cell-delimiting commas. Found " +
                            std::to_string(found) + ".";
      throw std::length_error(message);
    } else {
      // More than 16 needs further checks: extra commas are only OK when a
      // pair of quotes surrounds a comma, meaning it isn't a delimiter.
      std::vector<std::vector<size_t>> quoteBounds;
      findQuotePairs(line, quoteBounds);
      // If we didn't find any quotes, then there are too many commas and we
      // definitely have too many delimiters.
      if (quoteBounds.empty()) {
        std::string message = "A line must contain " +
                              std::to_string(expectedCommas) +
                              " cell-delimiting commas. Found " +
                              std::to_string(found) + ".";
        throw std::length_error(message);
      }
      // Now go through and split it up manually. Throw if we find ourselves
      // in a position where we'd add an 18th value to the vector.
      csvParse(line, cols, quoteBounds, expectedCommas);
    }
  } else {
    std::vector<std::vector<size_t>> quoteBounds;
    findQuotePairs(line, quoteBounds);
    csvParse(line, cols, quoteBounds, expectedCommas);
    if (cols.size() > expectedCommas) {
      // Re-join any surplus columns onto the last expected column.
      for (size_t i = expectedCommas + 1; i < cols.size(); i++) {
        cols[expectedCommas].append(
            boost::lexical_cast<std::string>("," + cols[i]));
      }
    } else if (cols.size() < expectedCommas) {
      std::string message = "A line must contain " +
                            std::to_string(expectedCommas) +
                            " cell-delimiting commas. Found " +
                            std::to_string(found) + ".";
      throw std::length_error(message);
    }
  }
  return cols.size();
}
struct slName *tagFindValList(struct tagStanza *stanza, char *tag)
/* Read in tag as a list. Do a slFreeList on this when done.
 * Returns NULL if no value. */
{
char *val = tagFindVal(stanza, tag);
return csvParse(val);
}
struct slName *tagMustFindValList(struct tagStanza *stanza, char *tag)
/* Find tag or die trying, and return it as a parsed-out list */
{
char *val = tagMustFindVal(stanza, tag);
return csvParse(val);
}
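/* Example (a sketch, not from the original source): typical use of the list-returning
 * accessors above. Both tagFindValList and tagMustFindValList hand back an slName list
 * that the caller must release with slFreeList. The loop below follows the usual
 * kent-source slName idiom; the "protocols" tag name is only an illustrative assumption. */
static void printProtocolList(struct tagStanza *stanza)
/* Print each comma-separated value of the (hypothetical) "protocols" tag on its own line. */
{
struct slName *list = tagFindValList(stanza, "protocols");
if (list == NULL)
    return;            // tag absent: tagFindValList returns NULL
struct slName *el;
for (el = list; el != NULL; el = el->next)
    printf("%s\n", el->name);
slFreeList(&list);     // caller owns the parsed list
}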
void rWriteJson(FILE *f, struct tagStorm *storm, struct tagStanza *stanza, struct ttjSubObj *obj,
    struct ttjSubObj *labeledObj, struct hash *schemaHash, struct dyString *scratch)
/* Write out json object recursively */
{
boolean isArray = allDigitNames(obj->children);
struct ttjSubObj *field;
if (isArray)
    {
    fprintf(f, "[");
    for (field = obj->children; field != NULL; field = field->next)
        {
        if (field != obj->children) // Only write comma separators after the first one
            fprintf(f, ",");
        rWriteJson(f, storm, stanza, field, labeledObj, schemaHash, scratch);
        }
    fprintf(f, "]");
    }
else
    {
    fprintf(f, "{");
    boolean firstOut = TRUE;

    /* Figure out if we need to attach a core object and do so. The figuring bit is
     * frankly clunky. */
    char *objType = labeledObj->name;
    if (sameString(objType, "submitter") || sameString(objType, "contributors"))
        objType = "contact";
    else if (sameString(objType, "publications"))
        objType = "publication";
    else if (sameString(objType, "protocol"))   // protocol is actually just protocol_id
        objType = "string";
    else if (sameString(objType, "protocols"))  // but protocols array is protocol
        objType = "protocol";
    else if (sameString(objType, "umi_barcode"))
        objType = "barcode";
    if (objNeedsCore(objType))
        printCore(f, objType, &firstOut);

    for (field = obj->children; field != NULL; field = field->next)
        {
        char *fieldName = field->name;
        if (field->children != NULL)
            {
            /* Look out for funny characteristics_ fields, as these are largely up to the user. */
            if (startsWith("characteristics_", field->name))
                errAbort("No '.' allowed in field name after characteristics_ in %s",
                    field->children->fullName);

            /* If we actually have data in this stanza, write our field. */
            if (prefixDotInStanza(field->fullName, stanza, scratch))
                {
                writeJsonTag(f, fieldName, &firstOut);
                rWriteJson(f, storm, stanza, field, field, schemaHash, scratch);
                }
            }
        else
            {
            char *val = tagFindVal(stanza, field->fullName);
            if (val != NULL)
                {
                boolean isNum = FALSE;
                char *schemaName = tagSchemaFigureArrayName(field->fullName, scratch);
                struct tagSchema *schema = hashFindVal(schemaHash, schemaName);
                if (schema != NULL)
                    isNum = (schema->type == '#' || schema->type == '%');
                if (sameString(fieldName, "files"))
                    {
                    writeJsonTag(f, "lanes", &firstOut);
                    writeLaneArray(f, stanza, val);
                    }
                else
                    {
                    boolean isArray = FALSE;
                    writeJsonTag(f, fieldName, &firstOut);
                    if (schema != NULL)
                        isArray = schema->isArray;
                    struct slName *list = csvParse(val);
                    if (isArray)
                        fputc('[', f);
                    else
                        {
                        if (list->next != NULL)  // more than one element
                            errAbort("Multiple vals for scalar tag %s in stanza starting line %d of %s",
                                field->fullName, stanza->startLineIx, storm->fileName);
                        }
                    struct slName *el;
                    for (el = list; el != NULL; el = el->next)
                        {
                        writeJsonVal(f, el->name, isNum);
                        if (el->next != NULL)
                            fputc(',', f);
                        }
                    if (isArray)
                        fputc(']', f);
                    slFreeList(&list);
                    }
                }
            }
        }
    fprintf(f, "}");
    }
}
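/* rWriteJson decides between emitting a JSON array and a JSON object by asking
 * allDigitNames() whether every child of the sub-object is named with digits only
 * (as happens when fields are flattened as files.1, files.2, ...). The helper below
 * is a minimal sketch of that check, written here purely for illustration; the real
 * allDigitNames in the source may differ in detail. */
static boolean allDigitNamesSketch(struct ttjSubObj *children)
/* Return TRUE if the list is non-empty and every child's name consists of digits only. */
{
if (children == NULL)
    return FALSE;
struct ttjSubObj *field;
for (field = children; field != NULL; field = field->next)
    {
    char *s = field->name;
    if (s == NULL || *s == 0)
        return FALSE;
    for (; *s != 0; ++s)
        if (*s < '0' || *s > '9')
            return FALSE;
    }
return TRUE;
}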
void writeLaneArray(FILE *f, struct tagStanza *stanza, char *csvList)
/* Write out an array of file objects based on the file names in csvList */
{
struct slName *list = csvParse(csvList), *file;

/* Figure out the number of files per lane. We'll take the lane number from the
 * file names if available, but if not we'll assume the list is sorted and will
 * put the appropriate number of files in each lane. */
int laneCounter = 1;
int filesPerLane = 1;
int curFileInLane = 0;
char *pairedEnds = tagMustFindVal(stanza, "assay.seq.paired_ends");
if (!sameString(pairedEnds, "no"))
    filesPerLane = 2;

/* First pass, make a list of lanes */
struct laneFiles *laneList = NULL, *lane;
for (file = list; file != NULL; file = file->next)
    {
    /* Figure out laneIx, from the file name if possible, otherwise by counting */
    char *fileName = file->name;
    int laneIx = laneFromFileName(fileName);
    if (laneIx == 0)
        laneIx = laneCounter;

    /* Update laneCounter */
    if (++curFileInLane >= filesPerLane)
        {
        ++laneCounter;
        curFileInLane = 0;
        }

    /* Find lane in laneList, make a new lane if it's not there. */
    lane = laneFilesFind(laneList, laneIx);
    if (lane == NULL)
        {
        AllocVar(lane);
        lane->laneIx = laneIx;
        slAddHead(&laneList, lane);
        }
    slNameAddTail(&lane->fileList, fileName);
    }
slReverse(&laneList);
slSort(&laneList, laneFilesCmp);

/* Now make a lane array and go through the lane list */
boolean firstOut = TRUE;
fputc('[', f);
for (lane = laneList; lane != NULL; lane = lane->next)
    {
    /* Write comma between lane objects */
    if (firstOut)
        firstOut = FALSE;
    else
        fputc(',', f);

    /* Write lane object starting with lane index */
    fputc('{', f);
    fprintf(f, "\"%s\": %d", "number", lane->laneIx);

    /* The rest of the lane fields are based on the files the lane contains */
    for (file = lane->fileList; file != NULL; file = file->next)
        {
        /* Calculate type */
        char *fileName = file->name;
        char *type = NULL;
        if (sameString(pairedEnds, "no"))
            {
            type = "r1";
            }
        else if (sameString(pairedEnds, "yes"))
            {
            int end = endFromFileName(fileName);
            if (end == 1)
                type = "r1";
            else
                type = "r2";
            }
        else
            errAbort("Unrecognized paired_ends %s", pairedEnds);
        fprintf(f, ",\"%s\":", type);
        writeJsonVal(f, fileName, FALSE);
        }
    fputc('}', f);
    }
fputc(']', f);
slFreeList(&list);
}
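/* writeLaneArray leans on laneFromFileName() to pull a lane number out of a sequencing
 * file name, falling back to positional counting when that returns 0. Below is a minimal
 * sketch of such a parser, assuming Illumina-style names containing a "_L<digits>_" token
 * (e.g. sample_S1_L002_R1_001.fastq.gz); the real helper in the source may use a different
 * convention, so treat this only as an illustration of the fallback contract. */
#include <string.h>

static int laneFromFileNameSketch(char *fileName)
/* Return the lane number encoded as "_L<digits>_" in fileName, or 0 if none is found. */
{
char *s = fileName;
while ((s = strstr(s, "_L")) != NULL)
    {
    char *p = s + 2;
    int lane = 0;
    while (*p >= '0' && *p <= '9')
        lane = lane * 10 + (*p++ - '0');
    if (p > s + 2 && *p == '_')   // at least one digit followed by a closing underscore
        return lane;
    s += 2;
    }
return 0;   // no lane token: caller falls back to counting files
}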