예제 #1
0
void edwCorrectFileTags(char *tabFileName)
/* edwCorrectFileTags - Correct tags in the edwFile table (and the corresponding fields in
 * the edwValidFile table) without forcing a validateManifest rerun or a reupload.
 *
 * tabFileName is a tab-separated file that must have an "accession" column.  The columns
 * md5_sum, size, valid_key and file_name are handed to checkForbiddenFields (presumably
 * they are rejected there - confirm against that helper).  Every other column is treated
 * as a tag name whose value on each row replaces the file's current value for that tag. */
{
struct sqlConnection *conn = edwConnectReadWrite();
char *forbiddenFields[] = {"md5_sum", "size", "valid_key", "file_name"};
char *requiredFields[] = {"accession",};
struct fieldedTable *table = fieldedTableFromTabFile(tabFileName, tabFileName,
    requiredFields, ArraySize(requiredFields));
checkForbiddenFields(table, forbiddenFields, ArraySize(forbiddenFields));
int accessionIx = stringArrayIx("accession", table->fields, table->fieldCount);

struct fieldedRow *fr = table->rowList;
while (fr != NULL)
    {
    /* Find the file this row is talking about. */
    long long fileId = edwNeedFileIdForLicensePlate(conn, fr->row[accessionIx]);
    struct edwFile *ef = edwFileFromId(conn, fileId);

    /* Start from the file's current tags and fold in each non-accession column in turn. */
    char *updatedTags = ef->tags;
    int col;
    for (col = 0; col < table->fieldCount; ++col)
        {
        if (col == accessionIx)
            continue;
        updatedTags = cgiStringNewValForVar(updatedTags, table->fields[col], fr->row[col]);
        }

    edwFileResetTags(conn, ef, updatedTags);
    edwFileFree(&ef);
    fr = fr->next;
    }
}
void addSdrfToStormTop(char *sdrfFile, struct tagStorm *storm)
/* Add lines of sdrfFile as children of first top level stanza in storm. */
{
struct fieldedTable *table = fieldedTableFromTabFile(sdrfFile, sdrfFile, NULL, 0 );


/* Convert ArrayExpress field names to our field names */
int fieldIx;
char *lastNonTerm = NULL;
char *lastNonUnit = NULL;
for (fieldIx=0; fieldIx < table->fieldCount; fieldIx += 1)
    {
    char tagName[256];
    aeFieldToNormalField("sdrf.", table->fields[fieldIx], tagName, sizeof(tagName));
    if (lastNonTerm != NULL && sameString("sdrf.Term_Source_REF", tagName))
	 {
         safef(tagName, sizeof(tagName), "%s_Term_Source_REF", lastNonTerm);
	 table->fields[fieldIx] = lmCloneString(table->lm, tagName);
	 }
    else if (lastNonTerm != NULL && sameString("sdrf.Term_Accession_Number", tagName))
	 {
         safef(tagName, sizeof(tagName), "%s_Term_Accession_Number", lastNonTerm);
	 table->fields[fieldIx] = lmCloneString(table->lm, tagName);
	 }
    else if (lastNonUnit != NULL && startsWith("sdrf.Unit_", tagName))
         {
	 safef(tagName, sizeof(tagName), "%s_Unit", lastNonUnit);
	 lastNonTerm = lmCloneString(table->lm, tagName);
	 table->fields[fieldIx] = lastNonTerm;
	 }
    else
	 {
         lastNonTerm = lastNonUnit = lmCloneString(table->lm, tagName);
	 table->fields[fieldIx] = lastNonTerm;
	 }
    }


/* Make up fastq field indexes to handle processing of paired reads in fastq, which
 * take two lines of sdrf file. */
char *fieldsWithFastqs[] = 
/* Fields that contain the fastq file names */
    {
    "sdrf.Comment_FASTQ_URI",
    "sdrf.Comment_SUBMITTED_FILE_NAME",
    "sdrf.Scan_Name",
    };
boolean mightReuseStanza = TRUE;
bool *reuseMultiFields;  // If set this field can vary and line still reused
AllocArray(reuseMultiFields, table->fieldCount);
int i;
for (i=0; i<ArraySize(fieldsWithFastqs); ++i)
    {
    char *field = fieldsWithFastqs[i];
    int ix = stringArrayIx(field, table->fields, table->fieldCount);
    if (ix >=0)
	reuseMultiFields[ix] = TRUE;
    else if (i == 0)
	{
	mightReuseStanza = FALSE;
        break;	    // Make sure has first one if going to do paired read fastq processing
	}
    }


/* Make up a list and hash of fieldMergers to handle conversion of columns that occur
 * multiple times to a comma-separated list of values in a single column. */
struct fieldMerger
/* Something to help merge multiple columns with same name */
    {
    struct fieldMerger *next;	/* Next in list */
    char *name;	
    struct dyString *val;	/* Comma separated value */
    };
struct hash *fieldHash = hashNew(0);
struct fieldMerger *fmList = NULL;
for (fieldIx = 0; fieldIx < table->fieldCount; ++fieldIx)
    {
    char *fieldName = table->fields[fieldIx];
    if (hashLookup(fieldHash, fieldName) == NULL)
        {
	struct fieldMerger *fm;
	AllocVar(fm);
	fm->name = fieldName;
	fm->val = dyStringNew(0);
	slAddTail(&fmList, fm);
	hashAdd(fieldHash, fieldName, fm);
	}
    }

/* Grab top level stanza and make sure there is only one. */
struct tagStanza *topStanza = storm->forest;
if (topStanza == NULL || topStanza->next != NULL)
    internalErr();

/* Scan through table, making new stanzas for each row and hooking them into topStanza */
struct fieldedRow *fr, *lastFr = NULL;
struct tagStanza *stanza = NULL;
for (fr = table->rowList; fr != NULL; fr = fr->next)
    {
    /* Empty out any existing vals */
    struct fieldMerger *fm;
    for (fm = fmList; fm != NULL; fm = fm->next)
	dyStringClear(fm->val);

    /* Add all non-empty values from this row to our fieldMergers. */
    char **row = fr->row;
    for (fieldIx = 0; fieldIx < table->fieldCount; ++fieldIx)
        {
	char *fieldName = table->fields[fieldIx];
	fm = hashMustFindVal(fieldHash, fieldName);
	char *val = row[fieldIx];
	if (!isEmpty(val))
	    csvEscapeAndAppend(fm->val, val);
	}

    /* If only the reuseMultiFields are varying, append to those values in previous stanza,
     * otherwise make a new stanza */
    if (mightReuseStanza && lastFr != NULL 
        && sameExceptForSome(lastFr->row, fr->row, table->fieldCount, reuseMultiFields))
	{
	int i;
	for (i=0; i<ArraySize(fieldsWithFastqs); ++i)
	    {
	    char *fieldName = fieldsWithFastqs[i];
	    if ((fm = hashFindVal(fieldHash, fieldName)) != NULL)
	        {
		char *newVal = fm->val->string;
		char *oldVal = tagMustFindVal(stanza, fieldName);
		int bothSize = strlen(newVal) + strlen(oldVal) + 1 + 1;
		char bothBuf[bothSize];
		safef(bothBuf, bothSize, "%s,%s", oldVal, newVal);
		tagStanzaUpdateTag(storm, stanza, fieldName, bothBuf);
		}
	    }
	}
    else
        {
	/* Output all nonempty vals to stanza */
	stanza = tagStanzaNew(storm, topStanza);
	for (fm = fmList; fm != NULL; fm = fm->next)
	    if (fm->val->stringSize > 0)
		tagStanzaAppend(storm, stanza, fm->name, fm->val->string);
	}

    lastFr = fr;
    }
slReverse(&topStanza->children);
}
예제 #3
0
void edwParseSubmitFile(struct sqlConnection *conn, char *submitLocalPath, char *submitUrl, 
    struct submitFileRow **retSubmitList)
/* Load and parse up this file as fielded table, make sure all required fields are there,
 * and calculate indexes of required fields.   This produces an edwFile list, but with
 * still quite a few fields missing - just what can be filled in from submit filled in. 
 * The submitUrl is just used for error reporting.  If it's local, just make it the
 * same as submitLocalPath. */
{
char *requiredFields[] = {"file_name", "format", "output_type", "experiment", "replicate", 
    "enriched_in", "md5_sum", "size",  "modified", "valid_key"};
struct fieldedTable *table = fieldedTableFromTabFile(submitLocalPath, submitUrl,
	requiredFields, ArraySize(requiredFields));

/* Get offsets of all required fields */
int fileIx = stringArrayIx("file_name", table->fields, table->fieldCount);
int formatIx = stringArrayIx("format", table->fields, table->fieldCount);
int outputIx = stringArrayIx("output_type", table->fields, table->fieldCount);
int experimentIx = stringArrayIx("experiment", table->fields, table->fieldCount);
int replicateIx = stringArrayIx("replicate", table->fields, table->fieldCount);
int enrichedIx = stringArrayIx("enriched_in", table->fields, table->fieldCount);
int md5Ix = stringArrayIx("md5_sum", table->fields, table->fieldCount);
int sizeIx = stringArrayIx("size", table->fields, table->fieldCount);
int modifiedIx = stringArrayIx("modified", table->fields, table->fieldCount);
int validIx = stringArrayIx("valid_key", table->fields, table->fieldCount);

/* See if we're doing replacement and check have all columns needed if so. */
int replacesIx = stringArrayIx(replacesTag, table->fields, table->fieldCount);
int replaceReasonIx = stringArrayIx(replaceReasonTag, table->fields, table->fieldCount);
boolean doReplace = (replacesIx != -1);
if (doReplace)
    if (replaceReasonIx == -1)
        errAbort("Error: got \"%s\" column without \"%s\" column in %s.", 
	    replacesTag, replaceReasonTag, submitUrl);

/* Loop through and make sure all field values are ok */
struct fieldedRow *fr;
for (fr = table->rowList; fr != NULL; fr = fr->next)
    {
    char **row = fr->row;
    char *fileName = row[fileIx];
    allGoodFileNameChars(fileName);
    char *format = row[formatIx];
    if (!isSupportedFormat(format))
	errAbort("Format %s is not supported", format);
    allGoodSymbolChars(row[outputIx]);
    char *experiment = row[experimentIx];
    if (!isExperimentId(experiment))
        errAbort("%s in experiment field does not seem to be an encode experiment", experiment);
    char *replicate = row[replicateIx];
    if (differentString(replicate, "pooled") && differentString(replicate, "n/a") )
	if (!isAllNum(replicate))
	    errAbort("%s is not a good value for the replicate column", replicate);
    char *enriched = row[enrichedIx];
    if (!encode3CheckEnrichedIn(enriched))
        errAbort("Enriched_in %s is not supported", enriched);
    char *md5 = row[md5Ix];
    if (strlen(md5) != 32 || !isAllHexLower(md5))
        errAbort("md5 '%s' is not in all lower case 32 character hexadecimal format.", md5);
    char *size = row[sizeIx];
    if (!isAllNum(size))
        errAbort("Invalid size '%s'", size);
    char *modified = row[modifiedIx];
    if (!isAllNum(modified))
        errAbort("Invalid modification time '%s'", modified);
    char *validIn = row[validIx];
    char *realValid = encode3CalcValidationKey(md5, sqlLongLong(size));
    if (!sameString(validIn, realValid))
        errAbort("The valid_key %s for %s doesn't fit", validIn, fileName);
    freez(&realValid);

    if (doReplace)
	{
	char *replaces = row[replacesIx];
	char *reason = row[replaceReasonIx];
	if (!isEmptyOrNa(replaces))
	    {
	    char *prefix = edwLicensePlateHead(conn);
	    if (!startsWith(prefix, replaces))
		errAbort("%s in replaces column is not an ENCODE file accession", replaces);
	    if (isEmptyOrNa(reason))
		errAbort("Replacing %s without a reason\n", replaces);
	    }
	}
    }

*retSubmitList = submitFileRowFromFieldedTable(conn, table, 
    fileIx, md5Ix, sizeIx, modifiedIx, replacesIx, replaceReasonIx);
}