void edwCorrectFileTags(char *tabFileName) /* edwCorrectFileTags - Use this to correct tags in the edwFile table and corresponding fields * in the edwValidFile table without forcing a validateManifest rerun or a reupload.. */ { struct sqlConnection *conn = edwConnectReadWrite(); char *requiredFields[] = {"accession",}; char *forbiddenFields[] = {"md5_sum", "size", "valid_key", "file_name"}; struct fieldedTable *table = fieldedTableFromTabFile(tabFileName, tabFileName, requiredFields, ArraySize(requiredFields)); checkForbiddenFields(table, forbiddenFields, ArraySize(forbiddenFields)); int accessionIx = stringArrayIx("accession", table->fields, table->fieldCount); struct fieldedRow *fr; for (fr = table->rowList; fr != NULL; fr = fr->next) { char *acc = fr->row[accessionIx]; long long id = edwNeedFileIdForLicensePlate(conn, acc); struct edwFile *ef = edwFileFromId(conn, id); int i; char *tags = ef->tags; for (i=0; i<table->fieldCount; ++i) { if (i != accessionIx) tags = cgiStringNewValForVar(tags, table->fields[i], fr->row[i]); } edwFileResetTags(conn, ef, tags); edwFileFree(&ef); } }
void addSdrfToStormTop(char *sdrfFile, struct tagStorm *storm)
/* Add lines of sdrfFile as children of first top level stanza in storm.
 * Each data row of the SDRF (ArrayExpress sample-and-data-relationship file)
 * becomes one child stanza, except that rows differing only in their fastq
 * columns (paired reads) are merged into the previous stanza with the fastq
 * values joined by commas. */
{
struct fieldedTable *table = fieldedTableFromTabFile(sdrfFile, sdrfFile, NULL, 0 );

/* Convert ArrayExpress field names to our field names.  The rewritten names are
 * allocated in the table's local memory pool (table->lm) so they live as long as
 * the table.  lastNonTerm/lastNonUnit remember the most recent "real" column so
 * that the Term_Source_REF / Term_Accession_Number / Unit_* columns that follow
 * it in an SDRF can be renamed to <realColumn>_Term_Source_REF etc. */
int fieldIx;
char *lastNonTerm = NULL;
char *lastNonUnit = NULL;
for (fieldIx=0; fieldIx < table->fieldCount; fieldIx += 1)
    {
    char tagName[256];
    aeFieldToNormalField("sdrf.", table->fields[fieldIx], tagName, sizeof(tagName));
    if (lastNonTerm != NULL && sameString("sdrf.Term_Source_REF", tagName))
        {
        /* Qualify a bare Term_Source_REF with the column it annotates. */
        safef(tagName, sizeof(tagName), "%s_Term_Source_REF", lastNonTerm);
        table->fields[fieldIx] = lmCloneString(table->lm, tagName);
        }
    else if (lastNonTerm != NULL && sameString("sdrf.Term_Accession_Number", tagName))
        {
        safef(tagName, sizeof(tagName), "%s_Term_Accession_Number", lastNonTerm);
        table->fields[fieldIx] = lmCloneString(table->lm, tagName);
        }
    else if (lastNonUnit != NULL && startsWith("sdrf.Unit_", tagName))
        {
        /* A Unit column annotates the last non-unit column; note it also becomes
         * the new lastNonTerm so a following Term_* column attaches to the unit. */
        safef(tagName, sizeof(tagName), "%s_Unit", lastNonUnit);
        lastNonTerm = lmCloneString(table->lm, tagName);
        table->fields[fieldIx] = lastNonTerm;
        }
    else
        {
        /* Ordinary column: it becomes the anchor for any following Term/Unit columns. */
        lastNonTerm = lastNonUnit = lmCloneString(table->lm, tagName);
        table->fields[fieldIx] = lastNonTerm;
        }
    }

/* Make up fastq field indexes to handle processing of paired reads in fastq, which
 * take two lines of sdrf file. */
char *fieldsWithFastqs[] =
/* Fields that contain the fastq file names */
    {
    "sdrf.Comment_FASTQ_URI",
    "sdrf.Comment_SUBMITTED_FILE_NAME",
    "sdrf.Scan_Name",
    };
boolean mightReuseStanza = TRUE;
bool *reuseMultiFields;  // If set this field can vary and line still reused
AllocArray(reuseMultiFields, table->fieldCount);
int i;
for (i=0; i<ArraySize(fieldsWithFastqs); ++i)
    {
    char *field = fieldsWithFastqs[i];
    int ix = stringArrayIx(field, table->fields, table->fieldCount);
    if (ix >=0)
        reuseMultiFields[ix] = TRUE;
    else if (i == 0)
        {
        mightReuseStanza = FALSE;
        break;  // Make sure has first one if going to do paired read fastq processing
        }
    }

/* Make up a list and hash of fieldMergers to handle conversion of columns that occur
 * multiple times to a comma-separated list of values in a single column. */
struct fieldMerger
/* Something to help merge multiple columns with same name */
    {
    struct fieldMerger *next;   /* Next in list */
    char *name;                 /* Field name (shared with table->fields, not owned) */
    struct dyString *val;       /* Comma separated value accumulated per row */
    };
struct hash *fieldHash = hashNew(0);
struct fieldMerger *fmList = NULL;
/* One merger per *distinct* field name; duplicate columns share a merger. */
for (fieldIx = 0; fieldIx < table->fieldCount; ++fieldIx)
    {
    char *fieldName = table->fields[fieldIx];
    if (hashLookup(fieldHash, fieldName) == NULL)
        {
        struct fieldMerger *fm;
        AllocVar(fm);
        fm->name = fieldName;
        fm->val = dyStringNew(0);
        slAddTail(&fmList, fm);
        hashAdd(fieldHash, fieldName, fm);
        }
    }

/* Grab top level stanza and make sure there is only one. */
struct tagStanza *topStanza = storm->forest;
if (topStanza == NULL || topStanza->next != NULL)
    internalErr();

/* Scan through table, making new stanzas for each row and hooking them into topStanza */
struct fieldedRow *fr, *lastFr = NULL;
struct tagStanza *stanza = NULL;
for (fr = table->rowList; fr != NULL; fr = fr->next)
    {
    /* Empty out any existing vals */
    struct fieldMerger *fm;
    for (fm = fmList; fm != NULL; fm = fm->next)
        dyStringClear(fm->val);

    /* Add all non-empty values from this row to our fieldMergers.  Duplicate
     * columns accumulate into the same merger as a CSV-escaped list. */
    char **row = fr->row;
    for (fieldIx = 0; fieldIx < table->fieldCount; ++fieldIx)
        {
        char *fieldName = table->fields[fieldIx];
        fm = hashMustFindVal(fieldHash, fieldName);
        char *val = row[fieldIx];
        if (!isEmpty(val))
            csvEscapeAndAppend(fm->val, val);
        }

    /* If only the reuseMultiFields are varying, append to those values in previous stanza,
     * otherwise make a new stanza */
    if (mightReuseStanza && lastFr != NULL
        && sameExceptForSome(lastFr->row, fr->row, table->fieldCount, reuseMultiFields))
        {
        /* Paired-read continuation row: append this row's fastq values, comma
         * separated, onto the corresponding tags of the stanza just made. */
        int i;
        for (i=0; i<ArraySize(fieldsWithFastqs); ++i)
            {
            char *fieldName = fieldsWithFastqs[i];
            if ((fm = hashFindVal(fieldHash, fieldName)) != NULL)
                {
                char *newVal = fm->val->string;
                char *oldVal = tagMustFindVal(stanza, fieldName);
                /* +1 for the joining comma, +1 for the terminating NUL. */
                int bothSize = strlen(newVal) + strlen(oldVal) + 1 + 1;
                char bothBuf[bothSize];
                safef(bothBuf, bothSize, "%s,%s", oldVal, newVal);
                tagStanzaUpdateTag(storm, stanza, fieldName, bothBuf);
                }
            }
        }
    else
        {
        /* Output all nonempty vals to stanza */
        stanza = tagStanzaNew(storm, topStanza);
        for (fm = fmList; fm != NULL; fm = fm->next)
            if (fm->val->stringSize > 0)
                tagStanzaAppend(storm, stanza, fm->name, fm->val->string);
        }
    lastFr = fr;
    }
/* NOTE(review): presumably tagStanzaNew prepends children, so reverse restores
 * file order — confirm against tagStorm library. */
slReverse(&topStanza->children);
}
void edwParseSubmitFile(struct sqlConnection *conn, char *submitLocalPath, char *submitUrl,
    struct submitFileRow **retSubmitList)
/* Load and parse up this file as fielded table, make sure all required fields are there,
 * and calculate indexes of required fields. This produces an edwFile list, but with
 * still quite a few fields missing - just what can be filled in from submit filled in.
 * The submitUrl is just used for error reporting. If it's local, just make it the
 * same as submitLocalPath.  Aborts (errAbort) on the first row that fails validation. */
{
/* fieldedTableFromTabFile aborts if any of these columns is absent. */
char *requiredFields[] = {"file_name", "format", "output_type", "experiment", "replicate",
    "enriched_in", "md5_sum", "size", "modified", "valid_key"};
struct fieldedTable *table = fieldedTableFromTabFile(submitLocalPath, submitUrl,
    requiredFields, ArraySize(requiredFields));

/* Get offsets of all required fields */
int fileIx = stringArrayIx("file_name", table->fields, table->fieldCount);
int formatIx = stringArrayIx("format", table->fields, table->fieldCount);
int outputIx = stringArrayIx("output_type", table->fields, table->fieldCount);
int experimentIx = stringArrayIx("experiment", table->fields, table->fieldCount);
int replicateIx = stringArrayIx("replicate", table->fields, table->fieldCount);
int enrichedIx = stringArrayIx("enriched_in", table->fields, table->fieldCount);
int md5Ix = stringArrayIx("md5_sum", table->fields, table->fieldCount);
int sizeIx = stringArrayIx("size", table->fields, table->fieldCount);
int modifiedIx = stringArrayIx("modified", table->fields, table->fieldCount);
int validIx = stringArrayIx("valid_key", table->fields, table->fieldCount);

/* See if we're doing replacement and check have all columns needed if so.
 * replacesTag/replaceReasonTag are file-level column-name constants defined
 * elsewhere in this file; -1 means the optional column is absent. */
int replacesIx = stringArrayIx(replacesTag, table->fields, table->fieldCount);
int replaceReasonIx = stringArrayIx(replaceReasonTag, table->fields, table->fieldCount);
boolean doReplace = (replacesIx != -1);
if (doReplace)
    if (replaceReasonIx == -1)
        errAbort("Error: got \"%s\" column without \"%s\" column in %s.",
            replacesTag, replaceReasonTag, submitUrl);

/* Loop through and make sure all field values are ok */
struct fieldedRow *fr;
for (fr = table->rowList; fr != NULL; fr = fr->next)
    {
    char **row = fr->row;
    char *fileName = row[fileIx];
    allGoodFileNameChars(fileName);
    char *format = row[formatIx];
    if (!isSupportedFormat(format))
        errAbort("Format %s is not supported", format);
    allGoodSymbolChars(row[outputIx]);
    char *experiment = row[experimentIx];
    if (!isExperimentId(experiment))
        errAbort("%s in experiment field does not seem to be an encode experiment", experiment);
    /* Replicate must be numeric unless it is one of the two special values. */
    char *replicate = row[replicateIx];
    if (differentString(replicate, "pooled") && differentString(replicate, "n/a") )
        if (!isAllNum(replicate))
            errAbort("%s is not a good value for the replicate column", replicate);
    char *enriched = row[enrichedIx];
    if (!encode3CheckEnrichedIn(enriched))
        errAbort("Enriched_in %s is not supported", enriched);
    /* md5 must look like a lower-case hex MD5 digest before we trust it below. */
    char *md5 = row[md5Ix];
    if (strlen(md5) != 32 || !isAllHexLower(md5))
        errAbort("md5 '%s' is not in all lower case 32 character hexadecimal format.", md5);
    char *size = row[sizeIx];
    if (!isAllNum(size))
        errAbort("Invalid size '%s'", size);
    char *modified = row[modifiedIx];
    if (!isAllNum(modified))
        errAbort("Invalid modification time '%s'", modified);
    /* valid_key must match the key recomputed from md5 and size, proving the
     * submitter actually computed both for this file.  realValid is allocated
     * by encode3CalcValidationKey and freed here. */
    char *validIn = row[validIx];
    char *realValid = encode3CalcValidationKey(md5, sqlLongLong(size));
    if (!sameString(validIn, realValid))
        errAbort("The valid_key %s for %s doesn't fit", validIn, fileName);
    freez(&realValid);
    if (doReplace)
        {
        char *replaces = row[replacesIx];
        char *reason = row[replaceReasonIx];
        if (!isEmptyOrNa(replaces))
            {
            /* A non-empty replaces value must be an accession with this
             * warehouse's license-plate prefix, and must carry a reason. */
            char *prefix = edwLicensePlateHead(conn);
            if (!startsWith(prefix, replaces))
                errAbort("%s in replaces column is not an ENCODE file accession", replaces);
            if (isEmptyOrNa(reason))
                errAbort("Replacing %s without a reason\n", replaces);
            }
        }
    }
/* All rows validated: convert table rows into the submitFileRow list for caller. */
*retSubmitList = submitFileRowFromFieldedTable(conn, table, fileIx, md5Ix, sizeIx,
    modifiedIx, replacesIx, replaceReasonIx);
}