static void processBlock(struct blastBlock *bb, unsigned flags, FILE* pslFh, FILE* scoreFh) /* process one gapped alignment block */ { struct blastGappedAli* ba = bb->gappedAli; struct blastQuery *bq = ba->query; struct psl *psl = pslBuildFromHsp(firstWordInLine(bq->query), bq->queryBaseCount, bb->qStart, bb->qEnd, ((bb->qStrand > 0) ? '+' : '-'), bb->qSym, firstWordInLine(ba->targetName), ba->targetSize, bb->tStart, bb->tEnd, ((bb->tStrand > 0) ? '+' : '-'), bb->tSym, flags); if (psl->blockCount > 0 && (bb->eVal <= eVal || eVal == -1)) outputPsl(bb, flags, psl, pslFh, scoreFh); pslFree(&psl); }
struct hash *hashFaNames(char *fileName) /* Return a hash full of the names (but not sequence) of all * records in fa file. */ { struct hash *hash = hashNew(0); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { line = skipLeadingSpaces(line); if (line[0] == '>') { line += 1; char *name = firstWordInLine(line); if (name == NULL || name[0] == 0) errAbort("Empty name in fasta file line %d of %s", lf->lineIx, lf->fileName); if (hashLookup(hash, name)) errAbort("Duplicate name %s in fasta file line %d of %s", name, lf->lineIx, lf->fileName); hashAdd(hash, name, NULL); } } lineFileClose(&lf); return hash; }
void oneGenieFile(char *fileName, struct hash *uniq, FILE *f)
/* Process one genie peptide prediction file into known and alt tab files.
 * Output format: one record per line, "name<TAB>sequence".  The uniq hash
 * (keyed by upper-cased name) suppresses case-insensitive duplicates across
 * calls: later duplicates are warned about and their sequence is skipped.
 * NOTE(review): relies on global 'abbr' — presumably a command-line prefix
 * to strip from record names; confirm against main(). */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
int lineSize;
boolean firstTime = TRUE;
char *trans;
boolean skip = FALSE;

/* Do cursory sanity check. */
if (!lineFileNext(lf, &line, &lineSize))
    errAbort("%s is empty", fileName);
if (line[0] != '>')
    errAbort("%s is badly formatted, doesn't begin with '>'", fileName);
lineFileReuse(lf);

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == '>')
        {
        /* End last line. */
        if (firstTime)
            firstTime = FALSE;
        else
            fputc('\n', f);
        trans = firstWordInLine(line+1);
        /* Strip optional prefix from the name. */
        if (abbr != NULL && startsWith(abbr, trans))
            trans += strlen(abbr);
        if (hashLookupUpperCase(uniq, trans) != NULL)
            {
            /* Case-insensitive duplicate: keep first occurrence only. */
            warn("Duplicate (case insensitive) '%s' line %d of %s. Ignoring all but first.",
                 trans, lf->lineIx, lf->fileName);
            skip = TRUE;
            }
        else
            {
            /* Record upper-cased name so later duplicates are caught. */
            char *upperCase;
            upperCase = cloneString(trans);
            touppers(upperCase);
            hashAdd(uniq, upperCase, NULL);
            freeMem(upperCase);
            fprintf(f, "%s\t", trans);
            skip = FALSE;
            }
        }
    else if (!skip)
        {
        /* Sequence line: append without trailing terminator so the whole
         * sequence lands on one output line. */
        mustWrite(f, line, lineSize-1);
        }
    }
fputc('\n', f);   /* terminate the final record */
lineFileClose(&lf);
}
void genericOne(char *fileName, struct hash *uniq, FILE *f) /* Process one ensemble peptide prediction file into tab delimited * output f, using uniq hash to make sure no dupes. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; int lineSize; boolean firstTime = TRUE; char *trans, transBuf[128]; /* Do cursory sanity check. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s is empty", fileName); if (line[0] != '>') errAbort("%s is badly formatted, doesn't begin with '>'", fileName); lineFileReuse(lf); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '>') { char *upperCase; /* End last line. */ if (firstTime) firstTime = FALSE; else fputc('\n', f); trans = firstWordInLine(line+1); if (abbr != NULL && startsWith(abbr, trans)) trans += strlen(abbr); if (suffix != NULL) { safef(transBuf, sizeof(transBuf), "%s%s", trans, suffix); trans = transBuf; } if (hashLookupUpperCase(uniq, trans) != NULL) errAbort("Duplicate (case insensitive) '%s' line %d of %s", trans, lf->lineIx, lf->fileName); upperCase = cloneString(trans); touppers(upperCase); hashAdd(uniq, upperCase, NULL); freeMem(upperCase); fprintf(f, "%s\t", trans); } else { mustWrite(f, line, lineSize-1); } } fputc('\n', f); lineFileClose(&lf); }
static Fastq* fastq_processNextSequence (int freeMemory, int truncateName) { char *line; static Fastq* currFQ = NULL; int count; Seq* currSeq = NULL; if (ls_isEof (lsFastq)) { if (freeMemory) { fastq_freeFastq (currFQ); } return NULL; } count = 0; while ( (line=ls_nextLine (lsFastq)) && (count<4) ) { if (line[0] == '\0') { continue; } if (line[0] == '@') { if (freeMemory) { fastq_freeFastq (currFQ); } count++; AllocVar (currFQ); AllocVar (currFQ->seq); currSeq = currFQ->seq; currSeq->name = hlr_strdup (line + 1); if (truncateName) { currSeq->name = firstWordInLine (skipLeadingSpaces (currSeq->name)); } line = ls_nextLine (lsFastq); // reading sequence currSeq->sequence = hlr_strdup ( line ); currSeq->size = strlen (currSeq->sequence); count++; line = ls_nextLine (lsFastq); // reading quality ID if( line[0] != '+' ) die("Expected quality ID: '+' or '+%s'", currSeq->name ); count++; line = ls_nextLine (lsFastq); // reading quality currFQ->quality = hlr_strdup( line ); count++; } } ls_back (lsFastq,1); return currFQ; }
boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in DNA or Peptide FA record in mixed case. Allow any upper or lower case
 * letter, or the dash character in.
 * Returns FALSE at EOF.  The returned sequence lives in the shared static
 * buffer faFastBuf and the name in a static array, so both are overwritten
 * by the next call (not thread-safe; caller must copy if it needs to keep them). */
{
char c;
int bufIx = 0;
static char name[512];
int lineSize, i;
char *line;

dnaUtilOpen();

/* Read first line, make sure it starts with '>', and read first word
 * as name of sequence. */
name[0] = 0;
if (!lineFileNext(lf, &line, &lineSize))
    {
    *retDna = NULL;
    *retSize = 0;
    return FALSE;
    }
if (line[0] == '>')
    {
    line = firstWordInLine(skipLeadingSpaces(line+1));
    if (line == NULL)
        errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
    strncpy(name, line, sizeof(name));
    name[sizeof(name)-1] = '\0'; /* Just to make sure name is NULL terminated. */
    }
else
    {
    errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
    }

/* Read until next '>' */
for (;;)
    {
    if (!lineFileNext(lf, &line, &lineSize))
        break;
    if (line[0] == '>')
        {
        /* Start of next record: push line back so the next call sees it. */
        lineFileReuse(lf);
        break;
        }
    /* Grow the shared buffer before copying this line's residues. */
    if (bufIx + lineSize >= faFastBufSize)
        expandFaFastBuf(bufIx, lineSize);
    for (i=0; i<lineSize; ++i)
        {
        c = line[i];
        /* Keep letters and '-'; silently drop digits, whitespace, etc. */
        if (isalpha(c) || c == '-')
            faFastBuf[bufIx++] = c;
        }
    }
/* Make room for the terminating zero if the buffer is exactly full. */
if (bufIx >= faFastBufSize)
    expandFaFastBuf(bufIx, 0);
faFastBuf[bufIx] = 0;
*retDna = faFastBuf;
*retSize = bufIx;
*retName = name;
if (bufIx == 0)
    {
    /* Empty record is reported but still returned as success. */
    warn("Invalid fasta format: sequence size == 0 for element %s",name);
    }
return TRUE;
}
struct trackHubSettingSpec *trackHubSettingsForVersion(char *specHost, char *version)
/* Return list of settings with support level. Version can be version string or spec url.
 * Scrapes the versioned trackDbHub.html spec page; when the same setting appears
 * more than once, the occurrence with the highest support level wins.
 * errAborts if the spec page or any settings cannot be retrieved. */
{
struct htmlPage *page = NULL;
if (version == NULL)
    {
    version = trackHubVersionDefault(specHost, &page);
    if (version == NULL)
        errAbort("Can't get default spec from host %s", specHost);
    }
/* Retrieve specs from file url.
 * Settings are the first text word within any <code> tag having class="level-" attribute.
 * The level represents the level of support for the setting (e.g. base, full, deprecated)
 * The support level ('level-*') is the class value of the <code> tag.
 * E.g. <code class="level-full">boxedConfig on</code> produces:
 *      setting=boxedConfig, class=full */
if (page == NULL)
    page = trackHubVersionSpecMustGet(specHost, version);
if (page == NULL)
    errAbort("Can't get settings spec for version %s from host %s", version, specHost);
verbose(5, "Found %d tags\n", slCount(page->tags));
struct trackHubSettingSpec *spec, *savedSpec;
struct hash *specHash = hashNew(0);
struct htmlTag *tag;
struct htmlAttribute *attr;
char buf[256];
for (tag = page->tags; tag != NULL; tag = tag->next)
    {
    if (differentWord(tag->name, "code"))
        continue;
    /* Only <code> tags whose first attribute is class="level-..." count. */
    attr = tag->attributes;
    if (attr == NULL || differentString(attr->name, "class") || !startsWith("level-", attr->val))
        continue;
    AllocVar(spec);
    /* Copy the text between this tag and the next into buf (clipped to buf size). */
    int len = min(tag->next->start - tag->end, sizeof buf - 1);
    memcpy(buf, tag->end, len);
    buf[len] = 0;
    verbose(6, "Found spec: %s\n", buf);
    spec->name = cloneString(firstWordInLine(buf));
    if (spec->name == NULL || strlen(spec->name) == 0)
        {
        warn("ERROR: format problem with trackDbHub.html -- contact UCSC.");
        continue;
        }
    /* Level is whatever follows "level-" in the class attribute. */
    spec->level = cloneString(chopPrefixAt(attr->val, '-'));
    verbose(6, "spec: name=%s, level=%s\n", spec->name, spec->level);
    savedSpec = (struct trackHubSettingSpec *)hashFindVal(specHash, spec->name);
    if (savedSpec != NULL)
        verbose(6, "found spec %s level %s in hash\n", savedSpec->name, savedSpec->level);
    if (savedSpec == NULL)
        {
        hashAdd(specHash, spec->name, spec);
        verbose(6, "added spec %s at level %s\n", spec->name, spec->level);
        }
    else if (trackHubSettingLevelCmp(spec, savedSpec) > 0)
        {
        /* Keep the occurrence with the higher support level. */
        hashReplace(specHash, spec->name, spec);
        verbose(6, "replaced spec %s at level %s, was %s\n", spec->name, spec->level, savedSpec->level);
        }
    }
struct hashEl *el, *list = hashElListHash(specHash);
int settingsCt = slCount(list);
verbose(5, "Found %d settings's\n", slCount(list));
if (settingsCt == 0)
    errAbort("Can't find hub setting info for version %s (host %s)."
             " Use -version to indicate a different version number or url.", version, specHost);
/* Sort by name and build the result list, tallying levels for the verbose report. */
slSort(&list, hashElCmp);
struct trackHubSettingSpec *specs = NULL;
int baseCt = 0;
int requiredCt = 0;
int deprecatedCt = 0;
for (el = list; el != NULL; el = el->next)
    {
    if (sameString(((struct trackHubSettingSpec *)el->val)->level, "base"))
        baseCt++;
    else if (sameString(((struct trackHubSettingSpec *)el->val)->level, "required"))
        requiredCt++;
    else if (sameString(((struct trackHubSettingSpec *)el->val)->level, "deprecated"))
        deprecatedCt++;
    slAddHead(&specs, el->val);
    }
slReverse(&specs);
verbose(3, "Found %d supported settings for this version (%d required, %d base, %d deprecated)\n",
        slCount(specs), requiredCt, baseCt, deprecatedCt);
return specs;
}
void ctgFaToFa(char *ctgFa, char *ctgCoords, char *ntDir)
/* ctgFaToFa - Convert from one big file with all NT contigs to one contig per file..
 * Pass 1 loads ctgCoords to count clones per NT contig; pass 2 streams the big
 * fasta, writing one <ntDir>/<ntName>.fa per multi-clone contig.  Contigs with
 * a single clone are skipped (f stays NULL, so their sequence is discarded). */
{
struct lineFile *lf;
char fileName[512], *line;
char *ntName, *hsName;
char *parts[6];
int lineSize, partCount;
struct hash *uniqHash = newHash(0);   /* NT names already written, to fake uniques */
FILE *f = NULL;                       /* current output file; NULL = skip sequence */
int dotMod = 0;                       /* progress-dot counter */
struct hash *ntHash = newHash(0);     /* NT name -> ntContig */
struct hash *hsHash = newHash(0);     /* Hs name -> ntContig */
struct ntContig *nt;
char *words[8];

/* Pass 1: tally clone counts per contig from the coords file. */
printf("Loading %s\n", ctgCoords);
lf = lineFileOpen(ctgCoords, TRUE);
while (lineFileRow(lf, words))
    {
    ntName = words[0];
    if ((nt = hashFindVal(ntHash, ntName)) != NULL)
        ++nt->cloneCount;
    else
        {
        AllocVar(nt);
        hashAddSaveName(ntHash, ntName, nt, &nt->name);
        hashAddSaveName(hsHash, words[1], nt, &nt->hsName);
        nt->cloneCount = 1;
        }
    }
lineFileClose(&lf);

/* Pass 2: split the big fasta (opened binary/unparsed, FALSE) by header. */
lf = lineFileOpen(ctgFa, FALSE);
makeDir(ntDir);
while (lineFileNext(lf, &line, &lineSize))
    {
    if ((++dotMod&0x1ffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }
    if (line[0] == '>')
        {
        carefulClose(&f);
        line[lineSize-1] = 0;   /* chop trailing newline before parsing */
        partCount = chopByChar(line, '|',parts,ArraySize(parts));
        if (partCount < 3)
            {
            uglyf("partCount = %d\n", partCount);
            errAbort("Expecting | separated header line %d of %s", lf->lineIx, lf->fileName);
            }
        ntName = parts[1];
        nt = hashFindVal(ntHash, ntName);
        hsName = parts[2];
        if (nt == NULL)
            {
            /* Not keyed by NT name: fall back to looking up by Hs name. */
            hsName = firstWordInLine(ntName);
            nt = hashMustFindVal(hsHash, hsName);
            ntName = nt->name;
            }
        if (nt->cloneCount > 1)
            {
            if (!startsWith("Hs", hsName))
                errAbort("Expecting %s to start with 'Hs' line %d of %s", hsName, lf->lineIx, lf->fileName);
            /* Same NT name seen twice: synthesize a fake unique name. */
            if (hashLookup(uniqHash, ntName))
                ntName = nextFakeNtName(hsName, ntName);
            hashAddUnique(uniqHash, ntName, NULL);
            if (!startsWith("NT_", ntName))
                errAbort("Expecting NT_ name line %d of %s", lf->lineIx, lf->fileName);
            sprintf(fileName, "%s/%s.fa", ntDir, ntName);
            f = mustOpen(fileName, "w");
            fprintf(f, ">%s.1_1\n", ntName);
            }
        }
    else
        {
        /* Sequence data: copy through only when a contig file is open. */
        if (f != NULL)
            mustWrite(f, line, lineSize);
        }
    }
printf("\n");
carefulClose(&f);
lineFileClose(&lf);
}
boolean faSpeedReadNextKeepCase(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in next FA entry as fast as we can. Faster than that old,
 * pokey faFastReadNext. Return FALSE at EOF.
 * The returned DNA and name will be overwritten by the next call
 * to this function. */
{
int c;
int bufIx = 0;
static char name[256];
int lineSize, i;
char *line;

dnaUtilOpen();

/* Read first line, make sure it starts with '>', and read first word
 * as name of sequence. */
name[0] = 0;
if (!lineFileNext(lf, &line, &lineSize))
    {
    *retDna = NULL;
    *retSize = 0;
    return FALSE;
    }
if (line[0] == '>')
    {
    line = firstWordInLine(skipLeadingSpaces(line+1));
    if (line == NULL)
        errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
    strncpy(name, line, sizeof(name));
    /* FIX: strncpy does not terminate when the source fills the buffer;
     * terminate explicitly (matches faMixedSpeedReadNext). */
    name[sizeof(name)-1] = '\0';
    }
else
    {
    errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
    }

/* Read until next '>' */
for (;;)
    {
    if (!lineFileNext(lf, &line, &lineSize))
        break;
    if (line[0] == '>')
        {
        /* Start of next record: push line back for the next call. */
        lineFileReuse(lf);
        break;
        }
    if (bufIx + lineSize >= faFastBufSize)
        expandFaFastBuf(bufIx);
    for (i=0; i<lineSize; ++i)
        {
        c = line[i];
        /* Keep only letters; digits/whitespace are silently dropped. */
        if (isalpha(c))
            {
            faFastBuf[bufIx++] = c;
            }
        }
    }
/* Make room for the terminating zero if the buffer is exactly full. */
if (bufIx >= faFastBufSize)
    expandFaFastBuf(bufIx);
faFastBuf[bufIx] = 0;
*retDna = faFastBuf;
*retSize = bufIx;
*retName = name;
return TRUE;
}
static struct jsonWrite *rTdbToJw(struct trackDb *tdb, struct hash *fieldHash, struct hash *excludeTypesHash, int depth, int maxDepth)
/* Recursively build and return a new jsonWrite object with JSON for tdb and its children,
 * or NULL if tdb or all children have been filtered out by excludeTypesHash.
 * If excludeTypesHash is non-NULL, omit any tracks/views/subtracks with type in excludeTypesHash.
 * If fieldHash is non-NULL, include only the field names indexed in fieldHash.
 * Caller owns the returned jsonWrite (free with jsonWriteFree). */
{
if (maxDepth >= 0 && depth > maxDepth)
    return NULL;
boolean doSubtracks = (tdb->subtracks && fieldOk("subtracks", fieldHash));
// If excludeTypesHash is given and tdb is a leaf track/subtrack, look up the first word
// of tdb->type in excludeTypesHash; if found, return NULL.
if (excludeTypesHash && !doSubtracks)
    {
    // Work on a copy: firstWordInLine truncates its argument in place.
    char typeCopy[PATH_LEN];
    safecpy(typeCopy, sizeof(typeCopy), tdb->type);
    if (hashLookup(excludeTypesHash, firstWordInLine(typeCopy)))
        return NULL;
    }
// A leaf that survived filtering always counts; a container only counts
// if at least one child survives (checked below).
boolean gotSomething = !doSubtracks;
struct jsonWrite *jwNew = jsonWriteNew();
jsonWriteObjectStart(jwNew, NULL);
writeTdbSimple(jwNew, tdb, fieldHash);
if (tdb->parent && fieldOk("parent", fieldHash))
    {
    // We can't link to an object in JSON and better not recurse here or else infinite loop.
    if (tdbIsSuperTrackChild(tdb))
        {
        // Supertracks have been omitted from fullTrackList, so add the supertrack object's
        // non-parent/child info here.
        jsonWriteObjectStart(jwNew, "parent");
        writeTdbSimple(jwNew, tdb->parent, fieldHash);
        jsonWriteObjectEnd(jwNew);
        }
    else
        // Just the name so we don't have infinite loops.
        jsonWriteString(jwNew, "parent", tdb->parent->track);
    }
if (doSubtracks)
    {
    jsonWriteListStart(jwNew, "subtracks");
    slSort(&tdb->subtracks, trackDbViewCmp);
    struct trackDb *subTdb;
    for (subTdb = tdb->subtracks; subTdb != NULL; subTdb = subTdb->next)
        {
        struct jsonWrite *jwSub = rTdbToJw(subTdb, fieldHash, excludeTypesHash, depth+1, maxDepth);
        if (jwSub)
            {
            gotSomething = TRUE;
            jsonWriteAppend(jwNew, NULL, jwSub);
            jsonWriteFree(&jwSub);
            }
        }
    jsonWriteListEnd(jwNew);
    }
jsonWriteObjectEnd(jwNew);
if (! gotSomething)
    // All children were excluded; clean up and null out jwNew.
    jsonWriteFree(&jwNew);
return jwNew;
}
void splitByBase(char *inName, int splitCount, char *outRoot, off_t estSize)
/* Split into a file base by base: stream all sequence characters from inName
 * into splitCount output files of roughly equal size, rewrapped at
 * outLineSize columns.  Extra FASTA records are skipped with a single
 * warning; their bases are NOT included. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
int lineSize;
char *line;
char c;
char dir[PATH_LEN], seqName[128], outFile[128], outPathName[PATH_LEN];
int digits = digitsBaseTen(splitCount);
boolean warnedMultipleRecords = FALSE;
int fileCount = 0;
off_t nextEnd = 0;
off_t curPos = 0;
FILE *f = NULL;
int linePos = 0;
int outLineSize = 50;

if (!lineFileNext(lf, &line, &lineSize))
    errAbort("%s is empty", inName);
if (line[0] == '>')
    {
    line = firstWordInLine(line+1);
    if (line == NULL)
        errAbort("Empty initial '>' line in %s", inName);
    strncpy(seqName, line, sizeof(seqName));
    /* FIX: strncpy does not terminate on truncation. */
    seqName[sizeof(seqName)-1] = '\0';
    }
else
    {
    splitPath(inName, dir, seqName, NULL);
    lineFileReuse(lf);
    }
splitPath(outRoot, NULL, outFile, NULL);

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == '>')
        {
        /* FIX: skip EVERY extra header line, warning only once.  Previously
         * 'continue' was inside the !warnedMultipleRecords branch, so the
         * second extra header fell through to the base loop and errAborted
         * on the '>' character ("Weird > ..."). */
        if (!warnedMultipleRecords)
            {
            warnedMultipleRecords = TRUE;
            warn("More than one record in FA file line %d of %s",
                 lf->lineIx, lf->fileName);
            }
        continue;
        }
    while ((c = *line++) != 0)
        {
        if (isdigit(c) || isspace(c))
            continue;
        if (!isalpha(c))
            errAbort("Weird %c (0x%x) line %d of %s", c, c, lf->lineIx, lf->fileName);
        /* Crossed the size boundary: finish current file and open the next. */
        if (++curPos >= nextEnd)
            {
            if (f != NULL)
                {
                if (linePos != 0)
                    fputc('\n', f);
                fclose(f);
                }
            mkOutPath(outPathName, outRoot, digits, fileCount);
            verbose(2, "writing %s\n", outPathName);
            f = mustOpen(outPathName, "w");
            fprintf(f, ">%s%0*d\n", outFile, digits, fileCount);
            ++fileCount;
            linePos = 0;
            nextEnd = calcNextEnd(fileCount, splitCount, estSize);
            }
        fputc(c, f);
        if (++linePos >= outLineSize)
            {
            fputc('\n', f);
            linePos = 0;
            }
        }
    }
if (f != NULL)
    {
    if (linePos != 0)
        fputc('\n', f);
    fclose(f);
    }
lineFileClose(&lf);
}
void faNcbiToUcsc(char *inFile, char *out) /* faNcbiToUcsc - Convert FA file from NCBI to UCSC format.. */ { struct lineFile *lf = lineFileOpen(inFile, TRUE); char outName[512]; char *line; boolean split = cgiBoolean("split"); boolean ntLast = cgiBoolean("ntLast"); boolean encode = cgiBoolean("encode"); struct dnaSeq seq; FILE *f = NULL; char *wordBefore = cgiUsualString("wordBefore", "gb"); int wordIx = cgiUsualInt("wordIx", -1); char *e = NULL; char *nt = NULL; ZeroVar(&seq); if (split) makeDir(out); else f = mustOpen(out, "w"); while (lineFileNext(lf, &line, NULL)) { if (line[0] == '>') { if (ntLast || encode) { nt = NULL; if (ntLast) { e = NULL; nt = stringIn("NT_", line); if (nt == NULL) nt = stringIn("NG_", line); if (nt == NULL) nt = stringIn("NC_", line); if (nt == NULL) errAbort("Expecting NT_ NG_ or NC_in '%s'", line); e = strchr(nt, '|'); if (e != NULL) *e = 0; e = strchr(nt, ' '); if (e != NULL) *e = 0; } else { nt = stringIn("|EN", line); if (nt == NULL) errAbort("Expecting EN in %s", line); nt++; nt = firstWordInLine(nt); } if (split) { sprintf(outName, "%s/%s.fa", out, nt); carefulClose(&f); f = mustOpen(outName, "w"); } fprintf(f, ">%s\n", nt); } else { char *words[32]; int wordCount, i; char *accession = NULL; wordCount = chopString(line+1, "|", words, ArraySize(words)); if (wordIx >= 0) { if (wordIx >= wordCount) errAbort("Sorry only %d words", wordCount); accession = words[wordIx]; } else { for (i=0; i<wordCount-1; ++i) { if (sameString(words[i], wordBefore)) { accession = words[i+1]; break; } } if (accession == NULL) errAbort("Couldn't find '%s' line %d of %s", wordBefore, lf->lineIx, lf->fileName); } chopSuffix(accession); fprintf(f, ">%s\n", accession); } } else { fprintf(f, "%s\n", line); } } }
void hgFlyBase(char *database, char *genesFile) /* hgFlyBase - Parse FlyBase genes.txt file and turn it into a couple of * tables. */ { char *tGene = "fbGene"; char *tSynonym = "fbSynonym"; char *tAllele = "fbAllele"; char *tRef = "fbRef"; char *tRole = "fbRole"; char *tPhenotype = "fbPhenotype"; char *tTranscript = "fbTranscript"; char *tGo = "fbGo"; char *tUniProt = "fbUniProt"; FILE *fGene = hgCreateTabFile(tabDir, tGene); FILE *fSynonym = hgCreateTabFile(tabDir, tSynonym); FILE *fAllele = hgCreateTabFile(tabDir, tAllele); FILE *fRef = hgCreateTabFile(tabDir, tRef); FILE *fRole = hgCreateTabFile(tabDir, tRole); FILE *fPhenotype = hgCreateTabFile(tabDir, tPhenotype); FILE *fTranscript = NULL; FILE *fGo = hgCreateTabFile(tabDir, tGo); FILE *fUniProt = hgCreateTabFile(tabDir, tUniProt); struct lineFile *lf = lineFileOpen(genesFile, TRUE); struct hash *refHash = newHash(19); int nextRefId = 0; int nextAlleleId = 0; char *line, sub, type, *rest, *s; char *geneSym = NULL, *geneName = NULL, *geneId = NULL; int recordCount = 0; struct slName *synList = NULL, *syn; int curAllele = 0, curRef = 0; struct ref *ref = NULL; struct sqlConnection *conn; struct hash *goUniqHash = newHash(18); /* Make table from flybase genes to BGDP transcripts. */ if (doTranscript) { fTranscript = hgCreateTabFile(tabDir, tTranscript); getAllSplices(database, fTranscript); } /* Make dummy reference for flybase itself. */ fprintf(fRef, "0\tFlyBase\n"); /* Loop through parsing and writing tab files. */ while (lineFileNext(lf, &line, NULL)) { sub = line[0]; if (sub == '#') { /* End of record. */ ++recordCount; if (geneId == NULL) errAbort("Record without *z line ending line %d of %s", lf->lineIx, lf->fileName); /* Write out synonyms. */ s = naForNull(geneSym); geneSym = ungreek(s); freeMem(s); s = naForNull(geneName); geneName = ungreek(s); if (! 
sameString(s, "n/a")) freeMem(s); if (geneSym != NULL && !sameString(geneSym, "n/a")) slNameStore(&synList, geneSym); if (geneName != NULL && !sameString(geneName, "n/a")) slNameStore(&synList, geneName); for (syn = synList; syn != NULL; syn = syn->next) { s = ungreek(syn->name); fprintf(fSynonym, "%s\t%s\n", geneId, s); freeMem(s); } /* Write out gene record. */ fprintf(fGene, "%s\t%s\t%s\n", geneId, geneSym, geneName); /* Clean up. */ freez(&geneSym); freez(&geneName); freez(&geneId); slFreeList(&synList); ref = NULL; curRef = curAllele = 0; continue; } else if (sub == 0) errAbort("blank line %d of %s, not allowed in gene.txt", lf->lineIx, lf->fileName); else if (isalnum(sub)) errAbort("line %d of %s begins with %c, not allowed", lf->lineIx, lf->fileName, sub); type = line[1]; rest = trimSpaces(line+2); if (sub == '*' && type == 'a') geneSym = cloneString(rest); else if (sub == '*' && type == 'e') geneName = cloneString(rest); else if (sub == '*' && type == 'z') { geneId = cloneString(rest); if (!startsWith("FBgn", geneId)) errAbort("Bad FlyBase gene ID %s line %d of %s", geneId, lf->lineIx, lf->fileName); } else if (type == 'i' && (sub == '*' || sub == '$')) { if (strlen(rest) > 2) /* Avoid short useless ones. 
*/ slNameStore(&synList, rest); } else if (sub == '*' && type == 'A') { if (geneId == NULL) errAbort("Allele before geneId line %d of %s", lf->lineIx, lf->fileName); curAllele = ++nextAlleleId; fprintf(fAllele, "%d\t%s\t%s\n", curAllele, geneId, rest); if (!sameString(rest, "classical") && !sameString(rest, "in vitro") && !sameString(rest, "wild-type") ) { slNameStore(&synList, rest); } } else if (sub == '*' && type == 'm') { if (geneId == NULL) errAbort("*m protein ID before geneId line %d of %s", lf->lineIx, lf->fileName); if (startsWith("UniProt", rest)) { char *ptr = strchr(rest, ':'); if (ptr != NULL) ptr++; else errAbort("Trouble parsing UniProt ID %s like %d of %s", rest, lf->lineIx, lf->fileName); fprintf(fUniProt, "%s\t%s\n", geneId, ptr); } } else if (type == 'E') { ref = hashFindVal(refHash, rest); if (ref == NULL) { AllocVar(ref); ref->id = ++nextRefId; hashAdd(refHash, rest, ref); subChar(rest, '\t', ' '); fprintf(fRef, "%d\t%s\n", ref->id, rest); } curRef = ref->id; } else if ((type == 'k' || type == 'r' || type == 'p') && sub != '@') { FILE *f = (type == 'r' ? fRole : fPhenotype); struct dyString *dy = suckSameLines(lf, line); subChar(dy->string, '\t', ' '); if (geneId == NULL) errAbort("Expecting *z in record before line %d of %s", lf->lineIx, lf->fileName); fprintf(f, "%s\t%d\t%d\t%s\n", geneId, curAllele, curRef, dy->string); dyStringFree(&dy); } else if (type == 'd' || type == 'f' || type == 'F') { FILE *f = fGo; char aspect = (type == 'd') ? 'P' : (type == 'f') ? 
'C' : 'F'; char *goId = rest; char *p = strstr(goId, " ; "); char assoc[128]; if (p == NULL) continue; else goId = firstWordInLine(p + 3); safef(assoc, sizeof(assoc), "%s.%s", geneId, goId); if (hashLookup(goUniqHash, assoc) == NULL) { hashAddInt(goUniqHash, assoc, 1); fprintf(f, "%s\t%s\t%c\n", geneId, goId, aspect); } } } printf("Processed %d records in %d lines\n", recordCount, lf->lineIx); lineFileClose(&lf); conn = sqlConnect(database); remakeTables(conn); if (doLoad) { printf("Loading %s\n", tGene); hgLoadTabFile(conn, tabDir, tGene, &fGene); if (doTranscript) { printf("Loading %s\n", tTranscript); hgLoadTabFile(conn, tabDir, tTranscript, &fTranscript); } printf("Loading %s\n", tSynonym); hgLoadTabFile(conn, tabDir, tSynonym, &fSynonym); printf("Loading %s\n", tAllele); hgLoadTabFile(conn, tabDir, tAllele, &fAllele); printf("Loading %s\n", tRef); hgLoadTabFile(conn, tabDir, tRef, &fRef); printf("Loading %s\n", tRole); hgLoadTabFile(conn, tabDir, tRole, &fRole); printf("Loading %s\n", tPhenotype); hgLoadTabFile(conn, tabDir, tPhenotype, &fPhenotype); printf("Loading %s\n", tGo); hgLoadTabFile(conn, tabDir, tGo, &fGo); printf("Loading %s\n", tUniProt); hgLoadTabFile(conn, tabDir, tUniProt, &fUniProt); hgRemoveTabFile(tabDir, tGene); if (doTranscript) hgRemoveTabFile(tabDir, tTranscript); hgRemoveTabFile(tabDir, tSynonym); hgRemoveTabFile(tabDir, tAllele); hgRemoveTabFile(tabDir, tRef); hgRemoveTabFile(tabDir, tRole); hgRemoveTabFile(tabDir, tPhenotype); hgRemoveTabFile(tabDir, tGo); hgRemoveTabFile(tabDir, tUniProt); } }