struct edwFile *edwFileFromId(struct sqlConnection *conn, long long fileId)
/* Look up the edwFile record whose id equals fileId and return it.
 * Returns NULL if not found. */
{
char sql[128];
sqlSafef(sql, sizeof(sql), "select * from edwFile where id=%lld", fileId);
struct edwFile *found = edwFileLoadByQuery(conn, sql);
return found;
}
struct edwFile *edwFileLoadIdRange(struct sqlConnection *conn, long long startId, long long endId)
/* Load the edwFile records with startId <= id <= endId, restricted to rows that
 * have nonzero endUploadTime and updateTime and an empty deprecated field. */
{
char sql[256];
sqlSafef(sql, sizeof(sql),
    "select * from edwFile "
    "where id>=%lld and id<=%lld "
    "and endUploadTime != 0 and updateTime != 0 "
    "and deprecated = ''",
    startId, endId);
struct edwFile *fileList = edwFileLoadByQuery(conn, sql);
return fileList;
}
void edwFixRevoked(char *database, char *inFile) /* edwFixRevoked - Mark as deprecated files that are revoked in ENCODE2. */ /* inFile is in format: * metaVariable objStatus revoked [- reason] * metaObject name */ { struct sqlConnection *conn = edwConnect(); struct lineFile *lf = lineFileOpen(inFile, TRUE); char *line; char *defaultReason = "Revoked in ENCODE2"; char *reason = defaultReason; while (lineFileNextReal(lf, &line)) { if (startsWithWord("metaVariable", line)) { char *pattern = "metaVariable objStatus revoked"; if (startsWithWord(pattern, line)) { reason = skipLeadingSpaces(line + strlen(pattern)); if (isEmpty(reason)) reason = defaultReason; else { if (reason[0] == '-') reason = skipLeadingSpaces(reason + 1); reason = cloneString(reason); } } else errAbort("??? %s\n", line); } else if (startsWithWord("metaObject", line)) { char *row[3]; int wordCount = chopLine(line, row); if (wordCount != 2) errAbort("Strange metaobject line %d of %s\n", lf->lineIx, lf->fileName); char *prefix = row[1]; if (!startsWith("wgEncode", prefix)) errAbort("Strange object line %d of %s\n", lf->lineIx, lf->fileName); char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s%%'", database, prefix); struct edwFile *ef, *efList = edwFileLoadByQuery(conn, query); printf("# %s %s\n", prefix, reason); for (ef = efList; ef != NULL; ef = ef->next) { long long id = ef->id; printf("update edwFile set deprecated='%s' where id=%lld;\n", reason, id); } } else errAbort("Unrecognized first word in %s\n", line); } }
struct edwFile *edwFileInProgress(struct sqlConnection *conn, int submitId)
/* Return the file of submission submitId that is currently being uploaded,
 * or NULL when the submission's fileIdInTransit is 0. */
{
char sql[256];
struct edwFile *inTransit = NULL;

/* Ask the submission which file, if any, is in transit. */
sqlSafef(sql, sizeof(sql), "select fileIdInTransit from edwSubmit where id=%u", submitId);
long long transitId = sqlQuickLongLong(conn, sql);

if (transitId != 0)
    {
    sqlSafef(sql, sizeof(sql), "select * from edwFile where id=%lld", transitId);
    inTransit = edwFileLoadByQuery(conn, sql);
    }
return inTransit;
}
struct edwFile *edwFileAllIntactBetween(struct sqlConnection *conn, int startId, int endId)
/* Return list of all intact files (finished uploading and MD5 checked) whose
 * file ID lies in startId..endId, both ends included.  Rows with a nonempty
 * errorMessage or deprecated field are left out. */
{
char sql[256];
sqlSafef(sql, sizeof(sql),
    "select * from edwFile "
    "where id>=%d and id<=%d "
    "and endUploadTime != 0 and updateTime != 0 "
    "and errorMessage = '' and deprecated = ''",
    startId, endId);
struct edwFile *intactList = edwFileLoadByQuery(conn, sql);
return intactList;
}
void edwFixGtfBigBed(char *how)
/* edwFixGtfBigBed - Regenerate every .gtf.bigBed file.  In the original import
 * these files were bad about half the time; Cricket caught this because a
 * bunch of them ended up with the same md5 sum.  The global doReal is set
 * when how is "real". */
{
doReal = sameString(how, "real");
struct sqlConnection *conn = edwConnectReadWrite();

/* Every file whose submitted name ends in .gtf.bigBed gets redone. */
struct edwFile *cur = edwFileLoadByQuery(conn,
    "select * from edwFile where submitFileName like '%.gtf.bigBed'");
while (cur != NULL)
    {
    redoOne(conn, cur);
    cur = cur->next;
    }
}
void edwMakeReplicateQa(int startId, int endId)
/* edwMakeReplicateQa - Do qa level comparisons of replicates.  Calls
 * doReplicateQa on every file with startId <= id <= endId that has nonzero
 * endUploadTime and updateTime and is not deprecated. */
{
struct sqlConnection *conn = sqlConnect(edwDatabase);

/* Select the candidate files in the ID range. */
char sql[256];
sqlSafef(sql, sizeof(sql),
    "select * from edwFile "
    "where id>=%d and id<=%d "
    "and endUploadTime != 0 and updateTime != 0 "
    "and deprecated = ''",
    startId, endId);

struct edwFile *cur = edwFileLoadByQuery(conn, sql);
while (cur != NULL)
    {
    doReplicateQa(conn, cur);
    cur = cur->next;
    }
}
void redoOne(struct sqlConnection *conn, struct edwFile *redoEf) /* Redo one file. */ { /* Figure out submit file name of the gtf file. */ char gtfFileName[PATH_LEN]; strcpy(gtfFileName, redoEf->submitFileName); chopSuffix(gtfFileName); strcat(gtfFileName, ".gz"); /* Get edwFile record for gtf file. */ char query[PATH_LEN+64]; safef(query, sizeof(query), "select * from edwFile where submitFileName='%s'", gtfFileName); struct edwFile *sourceEf = edwFileLoadByQuery(conn, query); assert(slCount(sourceEf) == 1); /* Get UCSC database */ safef(query, sizeof(query), "select ucscDb from edwValidFile where fileId=%u", sourceEf->id); char ucscDb[64] = ""; sqlQuickQuery(conn, query, ucscDb, sizeof(ucscDb)); assert(ucscDb[0] != 0); /* Remake the big bed file. */ char sourceFileName[PATH_LEN], destFileName[PATH_LEN]; safef(sourceFileName, sizeof(sourceFileName), "%s%s", edwRootDir, sourceEf->edwFileName); safef(destFileName, sizeof(destFileName), "%s%s", edwRootDir, redoEf->edwFileName); makeGtfBigBed(ucscDb, sourceFileName, destFileName); /* Recalculate size and md5 sum and validation key. */ char *md5 = md5HexForFile(destFileName); long long size = fileSize(destFileName); char *validKey = encode3CalcValidationKey(md5, size); /* Issue command to update md5 in database. */ char command[2*PATH_LEN]; safef(command, sizeof(command), "hgsql -e 'update edwFile set md5=\"%s\" where id=%u' encodeDataWarehouse", md5, redoEf->id); doSystem(command); /* Issue command to update tags in database. */ char *newTags = cgiStringNewValForVar(redoEf->tags, "valid_key", validKey); if (doReal) { edwFileResetTags(conn, redoEf, newTags); } }
void edwScriptSubmitStatus() /* edwScriptSubmitStatus - Programatically check status of submission.. */ { /* Pause a second - prevent inadvertent harsh denial of service from scripts. */ sleep(2); edwScriptRegistryFromCgi(); /* Get submission from url. */ struct sqlConnection *conn = edwConnect(); char query[512]; char *url = cgiString("url"); struct edwSubmit *sub = edwMostRecentSubmission(conn, url); char *status = NULL; if (sub == NULL) { int posInQueue = edwSubmitPositionInQueue(conn, url, NULL); if (posInQueue == -1) errAbort("%s has not been submitted", url); else status = "pending"; } else { time_t endUploadTime = sub->endUploadTime; if (!isEmpty(sub->errorMessage)) { status = "error"; } else if (endUploadTime == 0) { status = "uploading"; } else { safef(query, sizeof(query), "select count(*) from edwFile where submitId=%u and errorMessage != ''", sub->id); int errCount = sqlQuickNum(conn, query); int newValid = edwSubmitCountNewValid(sub, conn); if (newValid + errCount < sub->newFiles) status = "validating"; else if (errCount > 0) status = "error"; else status = "success"; } } /* Construct JSON result */ struct dyString *dy = dyStringNew(0); dyStringPrintf(dy, "{\n"); dyStringPrintf(dy, " \"status\": \"%s\"", status); if (sameString(status, "error")) { dyStringPrintf(dy, ",\n"); dyStringPrintf(dy, " \"errors\": [\n"); int errCount = 0; if (!isEmpty(sub->errorMessage)) { addErrFile(dy, errCount, sub->url, sub->errorMessage); ++errCount; } safef(query, sizeof(query), "select * from edwFile where submitId=%u and errorMessage != ''", sub->id); struct edwFile *file, *fileList = edwFileLoadByQuery(conn, query); for (file = fileList; file != NULL; file = file->next) { addErrFile(dy, errCount, file->submitFileName, file->errorMessage); ++errCount; } dyStringPrintf(dy, "\n ]\n"); dyStringPrintf(dy, "}\n"); } else { dyStringPrintf(dy, "\n}\n"); } /* Write out HTTP response */ printf("Content-Length: %d\r\n", dy->stringSize); puts("Content-Type: application/json; 
charset=UTF-8\r"); puts("\r"); printf("%s", dy->string); }
void edwFixReplaced(char *database, char *inTab, char *spikedTab, char *outSql, char *outRa) /* edwFixReplaced - Clean up files that were replaced in ENCODE2. */ { struct sqlConnection *conn = edwConnect(); struct lineFile *lf = lineFileOpen(inTab, TRUE); FILE *fSql = mustOpen(outSql, "w"); FILE *fRa = mustOpen(outRa, "w"); char *row[2]; struct hash *renameHash = rootRenameHash(); struct hash *spikedHash = hashTwoColumnFile(spikedTab); int depCount = 0, repCount = 0; while (lineFileRowTab(lf, row)) { /* Get fields in local variables. */ char *oldFileName = row[0]; char *objStatus = row[1]; /* Do spikein rename lookup. */ char *spiked = hashFindVal(spikedHash, oldFileName); if (spiked != NULL) { verbose(2, "renaming spikeing %s to %s\n", oldFileName, spiked); oldFileName = spiked; } /* Get rid of bai name for bam,bai pairs. */ char *comma = strchr(oldFileName, ','); if (comma != NULL) { if (!endsWith(comma, ".bai")) errAbort("Unexpected conjoining of files line %d of %s", lf->lineIx, lf->fileName); *comma = 0; } /* For .fastq.tgz files we got to unpack them. */ if (endsWith(oldFileName, ".fastq.tgz")) { /* Get root name - name minus suffix */ char *oldRoot = cloneString(oldFileName); chopSuffix(oldRoot); chopSuffix(oldRoot); verbose(2, "Processing fastq.tgz %s %s\n", oldFileName, oldRoot); // Find records for old version. char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s.fastq.tgz.dir/%%'" " order by submitFileName", database, oldRoot); struct edwFile *oldList = edwFileLoadByQuery(conn, query); int oldCount = slCount(oldList); if (oldCount == 0) errAbort("No records match %s", query); // Find record for replaced version. 
// Fortunately all of the fastq.tgz's are just V2, which simplifies code a bit sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%sV2.fastq.tgz.dir/%%'" " order by submitFileName", database, oldRoot); struct edwFile *newList = edwFileLoadByQuery(conn, query); int newCount = slCount(newList); if (newCount == 0) errAbort("No records match %s", query); // Make a hash of new records keyed by new file name inside of tgz struct edwFile *newEf; struct hash *newHash = hashNew(0); for (newEf = newList; newEf != NULL; newEf = newEf->next) { char fileName[FILENAME_LEN]; splitPath(newEf->submitFileName, NULL, fileName, NULL); hashAdd(newHash, fileName, newEf); verbose(2, " %s\n", fileName); } verbose(2, "%d in oldList, %d in newList\n", oldCount, newCount); // Loop through old records trying to find corresponding new record struct edwFile *oldEf; for (oldEf = oldList; oldEf != NULL; oldEf = oldEf->next) { char fileName[FILENAME_LEN]; splitPath(oldEf->submitFileName, NULL, fileName, NULL); struct edwFile *newEf = hashFindVal(newHash, fileName); char *newName = "n/a"; fprintf(fSql, "update edwFile set deprecated='%s' where id=%u;\n", objStatus, oldEf->id); ++depCount; if (newEf != NULL) { fprintf(fSql, "update edwFile set replacedBy=%u where id=%u;\n", newEf->id, oldEf->id); newName = newEf->submitFileName; ++repCount; } fprintf(fRa, "objStatus %s\n", objStatus); fprintf(fRa, "oldFile %s\n", oldEf->submitFileName); fprintf(fRa, "newFile %s\n", newName); fprintf(fRa, "\n"); verbose(2, "%s -> %s\n", oldEf->submitFileName, newName); } } else { /* Figure out new file name by either adding V2 at end, or if there is already a V#, * replacing it. */ #ifdef SOON #endif /* SOON */ int oldVersion = 1; char *noVersion = NULL; { /* Split old file name into root and suffix. 
*/ char *suffix = edwFindDoubleFileSuffix(oldFileName); if (suffix == NULL) errAbort("No suffix in %s line %d of %s", oldFileName, lf->lineIx, lf->fileName); char *oldRoot = cloneStringZ(oldFileName, suffix - oldFileName); char *renamed = hashFindVal(renameHash, oldRoot); if (renamed != NULL) { verbose(2, "Overriding %s with %s\n", oldRoot, renamed); oldRoot = cloneString(renamed); } /* Look for V# at end of old root, and if it's there chop it off and update oldVersion */ noVersion = oldRoot; // If no V, we done. */ char *vPos = strrchr(oldRoot, 'V'); if (vPos != NULL) { char *numPos = vPos + 1; int numSize = strlen(numPos); if (numSize == 1 || numSize == 2) { if (isAllDigits(numPos)) { oldVersion = atoi(numPos); *vPos = 0; } else errAbort("Expecting numbers after V in file name got %s line %d of %s", numPos, lf->lineIx, lf->fileName); } } verbose(2, "%s parses to %s %d %s\n", oldFileName, noVersion, oldVersion, suffix); /* Find record for old file. */ char query[512]; sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%s'", database, oldFileName); struct edwFile *oldEf = edwFileLoadByQuery(conn, query); if (slCount(oldEf) != 1) errAbort("Expecting one result got %d for %s\n", slCount(oldEf), query); fprintf(fSql, "# %s %s\n", oldFileName, objStatus); verbose(2, "%s: %s\n", oldFileName, objStatus); /* Find record for new file. 
*/ struct edwFile *newEf = NULL; int newVersion; for (newVersion = oldVersion+1; newVersion < 7; ++newVersion) { sqlSafef(query, sizeof(query), "select * from edwFile where submitFileName like '%s/%%/%sV%d%s'", database, noVersion, newVersion, suffix); newEf = edwFileLoadByQuery(conn, query); if (newEf != NULL) break; } if (newEf == NULL) verbose(2, "Could not find next version of %s (%s)", oldFileName, oldRoot); if (slCount(newEf) > 1) errAbort("Expecting one result got %d for %s\n", slCount(newEf), query); long long oldId = oldEf->id; fprintf(fSql, "update edwFile set deprecated='%s' where id=%lld;\n", objStatus, oldId); ++depCount; char *newName = "n/a"; if (newEf != NULL) { long long newId = newEf->id; fprintf(fSql, "update edwFile set replacedBy=%lld where id=%lld;\n", newId, oldId); newName = newEf->submitFileName; ++repCount; } fprintf(fRa, "objStatus %s\n", objStatus); fprintf(fRa, "oldFile %s\n", oldEf->submitFileName); fprintf(fRa, "newFile %s\n", newName); fprintf(fRa, "\n"); verbose(2, "%s -> %s\n", oldEf->submitFileName, newName); } } } verbose(1, "%d deprecated, %d replaced\n", depCount, repCount); carefulClose(&fSql); carefulClose(&fRa); }
struct edwFile *edwGetLocalFile(struct sqlConnection *conn, char *localAbsolutePath, char *symLinkMd5Sum) /* Get record of local file from database, adding it if it doesn't already exist. * Can make it a symLink rather than a copy in which case pass in valid MD5 sum * for symLinkM5dSum. */ { /* First do a reality check on the local absolute path. Is there a file there? */ if (localAbsolutePath[0] != '/') errAbort("Using relative path in edwAddLocalFile."); long long size = fileSize(localAbsolutePath); if (size == -1) errAbort("%s does not exist", localAbsolutePath); long long updateTime = fileModTime(localAbsolutePath); /* Get file if it's in database already. */ int submitDirId = getLocalSubmitDir(conn); int submitId = getLocalSubmit(conn); char query[256+PATH_LEN]; sqlSafef(query, sizeof(query), "select * from edwFile where submitId=%d and submitFileName='%s'", submitId, localAbsolutePath); struct edwFile *ef = edwFileLoadByQuery(conn, query); /* If we got something in database, check update time and size, and if it's no change just * return existing database id. */ if (ef != NULL && ef->updateTime == updateTime && ef->size == size) return ef; /* If we got here, then we need to make a new file record. Start with pretty empty record * that just has file ID, submitted file name and a few things*/ sqlSafef(query, sizeof(query), "insert edwFile (submitId,submitDirId,submitFileName,startUploadTime) " " values(%d, %d, '%s', %lld)" , submitId, submitDirId, localAbsolutePath, edwNow()); sqlUpdate(conn, query); long long fileId = sqlLastAutoId(conn); /* Create big data warehouse file/path name. */ char edwFile[PATH_LEN], edwPath[PATH_LEN]; edwMakeFileNameAndPath(fileId, localAbsolutePath, edwFile, edwPath); /* We're a little paranoid so md5 it */ char *md5; /* Do copy or symbolic linking of file into warehouse managed dir. 
*/ if (symLinkMd5Sum) { md5 = symLinkMd5Sum; makeSymLink(localAbsolutePath, edwPath); } else { copyFile(localAbsolutePath, edwPath); md5 = md5HexForFile(localAbsolutePath); } /* Update file record. */ sqlSafef(query, sizeof(query), "update edwFile set edwFileName='%s', endUploadTime=%lld," "updateTime=%lld, size=%lld, md5='%s' where id=%lld" , edwFile, edwNow(), updateTime, size, md5, fileId); sqlUpdate(conn, query); /* Now, it's a bit of a time waste, but cheap in code, to just load it back from DB. */ sqlSafef(query, sizeof(query), "select * from edwFile where id=%lld", fileId); return edwFileLoadByQuery(conn, query); }