/** * Returns a pointer to next ElandQuery. * @pre The module has been initialized using elandParser_init(). * Parse entries of the following format: \verbatim >FC30H5TAA_100308:2:1:1647:1161 GCTTACATTTTTCCTCTCTACATTATC U0 1 0 0 chr17.fa 8466296 F .. >FC30H5TAA_100308:2:1:1588:122 GAGTTAGCCTTGGGACCCCTACTTCTT U0 1 0 0 chr3.fa 61525628 F .. >FC30H5TAA_100308:2:1:1642:123 GGTGAGAGCCGCGACGGGCTTTAGGCG NM 0 0 0 >FC30H5TAA_100308:2:1:1630:119 CCGCCATTGCCAGCCCCCAGCTGACGG R2 0 0 2 >FC30H5TAA_100308:2:1:1603:120 GCAAGATGAAGTGAAAGGTAAAGAATC U1 0 1 1 chrM.fa 15277 R .. 26A \endverbatim */ ElandQuery* elandParser_nextQuery (void) { WordIter w; char *line,*token,*pos; static ElandQuery *currElandQuery = NULL; while (line = ls_nextLine (ls)) { if (line[0] == '\0') { continue; } elandParser_freeQuery (currElandQuery); currElandQuery = NULL; AllocVar (currElandQuery); w = wordIterCreate (line,"\t",0); currElandQuery->sequenceName = hlr_strdup (wordNext (w) + 1); // remove the '>' character at beginning of the line currElandQuery->sequence = hlr_strdup (wordNext (w)); currElandQuery->matchCode = hlr_strdup (wordNext (w)); if (strEqual (currElandQuery->matchCode,"QC")) { wordIterDestroy (w); return currElandQuery; } currElandQuery->exactMatches = atoi (wordNext (w)); currElandQuery->oneErrorMatches = atoi (wordNext (w)); currElandQuery->twoErrorMatches = atoi (wordNext (w)); token = wordNext (w); if (token == NULL) { wordIterDestroy (w); return currElandQuery; } if (!(pos = strchr (token,'.'))) { die ("Expected '.' in chromosome name: %s",token); } *pos = '\0'; currElandQuery->chromosome = hlr_strdup (pos + 1); currElandQuery->position = atoi (wordNext (w)); token = wordNext (w); if (token[0] == 'F') { currElandQuery->strand = '+'; } else if (token[0] == 'R') { currElandQuery->strand = '-'; } wordIterDestroy (w); return currElandQuery; } elandParser_freeQuery (currElandQuery); currElandQuery = NULL; return currElandQuery; }
/**
 * Builds the textual description of the amino acid change caused by an indel
 * and stores a copy of it in currAlteration->substitution.
 *
 * The amino acid index is derived from currAlteration->relativePosition
 * ((relativePosition - 1) / 3). Depending on how the two protein lengths
 * compare, the text has one of these shapes:
 *   - protein grew:    "<index+1>_<one residue before>-><residues after>"
 *   - protein shrank:  "<index+1>_<residues before>-><one residue after>"
 *   - same length:     "<index>_<substring before>-><substring after>"
 *
 * @param[in,out] currAlteration Alteration whose substitution field is set
 * @param[in] proteinSequenceBeforeIndel Protein sequence without the indel
 * @param[in] proteinSequenceAfterIndel Protein sequence with the indel
 * @param[in] indelOffset Offset in nucleotides; only used in the equal-length case
 */
static void addSubstitution (Alteration *currAlteration, char* proteinSequenceBeforeIndel, char *proteinSequenceAfterIndel, int indelOffset)
{
  static Stringa buffer = NULL;
  int aaIndex;
  int lenBefore,lenAfter,lenDelta;
  int windowEnd;
  int k;

  stringCreateClear (buffer,100);
  aaIndex = (currAlteration->relativePosition - 1) / 3;
  lenBefore = strlen (proteinSequenceBeforeIndel);
  lenAfter = strlen (proteinSequenceAfterIndel);
  lenDelta = abs (lenBefore - lenAfter);

  if (lenBefore == lenAfter) {
    // same window is cut out of both sequences
    windowEnd = aaIndex + (int)ceil ((double)indelOffset / 3);
    stringPrintf (buffer,"%d_%s->",aaIndex,subString (proteinSequenceBeforeIndel,aaIndex - 1,windowEnd));
    stringAppendf (buffer,"%s",subString (proteinSequenceAfterIndel,aaIndex - 1,windowEnd));
  }
  else if (lenBefore < lenAfter) {
    // one residue is replaced by lenDelta + 1 residues
    stringPrintf (buffer,"%d_%c->",aaIndex + 1,proteinSequenceBeforeIndel[aaIndex]);
    k = 0;
    while (k <= lenDelta) {
      stringAppendf (buffer,"%c",proteinSequenceAfterIndel[aaIndex + k]);
      k++;
    }
  }
  else {
    // lenDelta + 1 residues are replaced by one residue
    stringPrintf (buffer,"%d_",aaIndex + 1);
    k = 0;
    while (k <= lenDelta) {
      stringAppendf (buffer,"%c",proteinSequenceBeforeIndel[aaIndex + k]);
      k++;
    }
    stringAppendf (buffer,"->%c",proteinSequenceAfterIndel[aaIndex]);
  }
  currAlteration->substitution = hlr_strdup (string (buffer));
}
/**
 * Reads MRF entries from standard input, sums the values returned by
 * processEntry() for the target region given on the command line, and
 * prints the total.
 *
 * The single argument has the form targetName:targetStart-targetEnd and is
 * split on ':', '-' and ' '. An argument that does not yield three fields is
 * rejected with the usage message rather than passing NULL to atoi().
 */
int main (int argc, char *argv[])
{
  MrfEntry *currEntry;
  char *targetName = NULL;
  char *token;
  int targetStart = 0;
  int targetEnd = 0;
  int numFields = 0;
  int count = 0;
  WordIter w;

  if (argc != 2) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  w = wordIterCreate (argv[1],":- ",0);
  if ((token = wordNext (w)) != NULL) {
    targetName = hlr_strdup (token);
    numFields++;
  }
  if (numFields == 1 && (token = wordNext (w)) != NULL) {
    targetStart = atoi (token);
    numFields++;
  }
  if (numFields == 2 && (token = wordNext (w)) != NULL) {
    targetEnd = atoi (token);
    numFields++;
  }
  wordIterDestroy (w);
  if (numFields != 3) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
    return 1; // only reached if usage() returns
  }
  mrf_init ("-");
  while ((currEntry = mrf_nextEntry ()) != NULL) {
    count += processEntry (currEntry,targetName,targetStart,targetEnd);
  }
  printf ("Count for %s:%d-%d = %d\n",targetName,targetStart,targetEnd,count);
  mrf_deInit ();
  hlr_free (targetName);
  return 0;
}
/**
 * Reads MRF entries from standard input, writes the MRF header to standard
 * output and passes every entry, together with the target region given on
 * the command line, to processEntry().
 *
 * The single argument has the form targetName:targetStart-targetEnd and is
 * split on ':', '-' and ' '. An argument that does not yield three fields is
 * rejected with the usage message rather than passing NULL to atoi().
 */
int main (int argc, char *argv[])
{
  MrfEntry *currEntry;
  char *targetName = NULL;
  char *token;
  int targetStart = 0;
  int targetEnd = 0;
  int numFields = 0;
  WordIter w;

  if (argc != 2) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  w = wordIterCreate (argv[1],":- ",0);
  if ((token = wordNext (w)) != NULL) {
    targetName = hlr_strdup (token);
    numFields++;
  }
  if (numFields == 1 && (token = wordNext (w)) != NULL) {
    targetStart = atoi (token);
    numFields++;
  }
  if (numFields == 2 && (token = wordNext (w)) != NULL) {
    targetEnd = atoi (token);
    numFields++;
  }
  wordIterDestroy (w);
  if (numFields != 3) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
    return 1; // only reached if usage() returns
  }
  mrf_init ("-");
  puts (mrf_writeHeader ());
  while ((currEntry = mrf_nextEntry ()) != NULL) {
    processEntry (currEntry,targetName,targetStart,targetEnd);
  }
  mrf_deInit ();
  hlr_free (targetName);
  return 0;
}
/**
 * Looks up a CGI parameter by name and returns its value to R.
 *
 * Iterates over the CGI parameters via cgiGetInit()/cgiGetNext() and stops at
 * the first parameter whose name matches r_param.
 *
 * @param r_param     Name of the parameter to look up; R_NilValue yields R_NilValue
 * @param ignore_case If true, names are compared with myStrCaseEqual,
 *                    otherwise with myStrEqual
 * @param r_default   Value returned unchanged when no parameter matches
 * @return A character vector of length one holding the value, or r_default
 */
SEXP r_cgiParam(SEXP r_param, SEXP ignore_case, SEXP r_default)
{
  Stringa value;
  char *name;
  char *param;
  int (*fPtr)(char*, char*);
  SEXP res;

  if(r_param == R_NilValue)
    return(R_NilValue);

  param = cStr(r_param);
  fPtr = cBool(ignore_case) ? &myStrCaseEqual : &myStrEqual;
  value = stringCreate(16);
  res = r_default;

  cgiGetInit();
  while((name = cgiGetNext(value)) != NULL) {
    if((*fPtr)(name, param)) {
      /* mkString() copies the C string, so no intermediate duplicate is
         needed (the original strdup'ed copy was never freed) */
      res = mkString(string(value));
      break;
    }
  }
  stringDestroy(value);
  return res;
}
static Fastq* fastq_processNextSequence (int freeMemory, int truncateName) { char *line; static Fastq* currFQ = NULL; int count; Seq* currSeq = NULL; if (ls_isEof (lsFastq)) { if (freeMemory) { fastq_freeFastq (currFQ); } return NULL; } count = 0; while ( (line=ls_nextLine (lsFastq)) && (count<4) ) { if (line[0] == '\0') { continue; } if (line[0] == '@') { if (freeMemory) { fastq_freeFastq (currFQ); } count++; AllocVar (currFQ); AllocVar (currFQ->seq); currSeq = currFQ->seq; currSeq->name = hlr_strdup (line + 1); if (truncateName) { currSeq->name = firstWordInLine (skipLeadingSpaces (currSeq->name)); } line = ls_nextLine (lsFastq); // reading sequence currSeq->sequence = hlr_strdup ( line ); currSeq->size = strlen (currSeq->sequence); count++; line = ls_nextLine (lsFastq); // reading quality ID if( line[0] != '+' ) die("Expected quality ID: '+' or '+%s'", currSeq->name ); count++; line = ls_nextLine (lsFastq); // reading quality currFQ->quality = hlr_strdup( line ); count++; } } ls_back (lsFastq,1); return currFQ; }
/**
 * Appends one GffEntry per alignment block of a read to gffEntries.
 *
 * Each entry gets a copy of the block's target name and a GFF line of the form
 * "<target>\tMRF\texon\t<start>\t<end>\t.\t.\t.\tTG<groupNumber>". The strand
 * column is written as '.'; the block's strand is not emitted.
 *
 * @param[in,out] gffEntries Array of GffEntry that is extended
 * @param[in] currRead Read whose blocks are converted
 * @param[in] groupNumber Number appended to the "TG" group label
 */
static void createGffEntry( Array gffEntries, MrfRead *currRead, int groupNumber )
{
  static Stringa buffer = NULL;
  MrfBlock *block;
  GffEntry *entry;
  int b;

  stringCreateClear (buffer,100);
  b = 0;
  while (b < arrayMax (currRead->blocks)) {
    block = arrp (currRead->blocks,b,MrfBlock);
    stringPrintf (buffer,"%s\tMRF\texon\t%d\t%d\t.\t.\t.\tTG%d",
                  block->targetName,
                  block->targetStart,
                  block->targetEnd,
                  groupNumber);
    // append a fresh slot at the end of the output array
    entry = arrayp (gffEntries,arrayMax (gffEntries),GffEntry);
    entry->targetName = hlr_strdup (block->targetName);
    entry->line = hlr_strdup (string (buffer));
    b++;
  }
}
/**
 * Looks up the TreeFam identifier of a transcript.
 *
 * Searches kgTreeFams with arrayFind() using sortKgTreeFamsByTranscriptName as
 * the ordering function (presumably the array must already be sorted with
 * that same function; verify against the caller).
 *
 * @param[in] kgTreeFams Array of KgTreeFam
 * @param[in] transcript Transcript name to search for
 * @return The treeFamId stored in the matching array element (not a copy),
 *         or NULL if the transcript is not present
 */
static char* lookUpTreeFam (Array kgTreeFams, char *transcript)
{
  KgTreeFam probe;
  int hitIndex;
  int isPresent;

  // only the transcript name of the probe element is filled in
  probe.transcriptName = hlr_strdup (transcript);
  isPresent = arrayFind (kgTreeFams,&probe,&hitIndex,(ARRAYORDERF)sortKgTreeFamsByTranscriptName);
  hlr_free (probe.transcriptName);
  if (!isPresent) {
    return NULL;
  }
  return arrp (kgTreeFams,hitIndex,KgTreeFam)->treeFamId;
}
/** * Get the next BlastQuery. * @pre The module has been initialized using blastParser_init(). */ BlastQuery* blastParser_nextQuery (void) { char *line,*pos; static char *queryName = NULL; static char *prevBlastQueryName = NULL; static BlastQuery *currBlastQuery = NULL; int first; if (!ls_isEof (ls)) { blastParser_freeQuery (currBlastQuery); currBlastQuery = NULL; AllocVar (currBlastQuery); currBlastQuery->entries = arrayCreate (5,BlastEntry); first = 1; while (line = ls_nextLine (ls)) { if (line[0] == '\0') { continue; } pos = strchr (line,'\t'); *pos = '\0'; strReplace (&queryName,line); if (first == 1 || strEqual (prevBlastQueryName,queryName)) { blastParser_processLine (pos + 1,currBlastQuery); } else { ls_back (ls,1); return currBlastQuery; } if (first == 1) { currBlastQuery->qName = hlr_strdup (queryName); first = 0; } strReplace(&prevBlastQueryName,queryName); } if (first == 1) { return NULL; } else { return currBlastQuery; } } blastParser_freeQuery (currBlastQuery); currBlastQuery = NULL; return NULL; }
static void blastParser_processLine (char* line, BlastQuery* currBlastQuery) { WordIter w; BlastEntry *currEntry; currEntry = arrayp (currBlastQuery->entries,arrayMax (currBlastQuery->entries),BlastEntry); w = wordIterCreate (line,"\t",0); currEntry->tName = hlr_strdup (wordNext (w)); currEntry->percentIdentity = atof (wordNext (w)); currEntry->alignmentLength = atoi (wordNext (w)); currEntry->misMatches = atoi (wordNext (w)); currEntry->gapOpenings = atoi (wordNext (w)); currEntry->qStart = atoi (wordNext (w)); currEntry->qEnd = atoi (wordNext (w)); currEntry->tStart = atoi (wordNext (w)); currEntry->tEnd = atoi (wordNext (w)); currEntry->evalue = atof (wordNext (w)); currEntry->bitScore = atof (wordNext (w)); wordIterDestroy (w); }
/**
 * Scans the Wig elements of one target and appends a Tar for every run of
 * elements at or above the threshold.
 *
 * A run starts at the first element whose value is not below threshold. It is
 * extended over following elements until maxGap consecutive elements below
 * threshold have been seen or the array ends. The scan resumes at the element
 * where the extension stopped.
 *
 * NOTE(review): the run end is tracked as an index into wigs, yet it is
 * compared against and stored next to Wig positions. This is only consistent
 * if array index and position coincide; confirm against the caller. Behaviour
 * is kept exactly as it was.
 *
 * @param[in,out] tars Array of Tar that is extended
 * @param[in] wigs Array of Wig
 * @param[in] targetName Name copied into every Tar
 * @param[in] threshold Elements with value < threshold count as below
 * @param[in] maxGap Number of consecutive below-threshold elements that ends a run
 * @param[in] minRun Minimum value of (run end - start position) for a Tar to be kept
 */
void performSegmentation (Array tars, Array wigs, char* targetName, double threshold, int maxGap, int minRun)
{
  Tar *newTar;
  Wig *seed,*probe;
  int seedIdx,scanIdx;
  int lastHitIdx;
  int gapLen;

  seedIdx = 0;
  while (seedIdx < arrayMax (wigs)) {
    seed = arrp (wigs,seedIdx,Wig);
    if (seed->value < threshold) {
      seedIdx++;
      continue;
    }
    lastHitIdx = seedIdx + 1;
    gapLen = 0;
    for (scanIdx = seedIdx + 1; scanIdx < arrayMax (wigs); scanIdx++) {
      probe = arrp (wigs,scanIdx,Wig);
      if (probe->value < threshold) {
        gapLen++;
        if (gapLen >= maxGap) {
          break;
        }
      }
      else {
        gapLen = 0;
        lastHitIdx = scanIdx;
      }
    }
    if ((lastHitIdx - seed->position) >= minRun) {
      newTar = arrayp (tars,arrayMax (tars),Tar);
      newTar->start = seed->position;
      newTar->end = lastHitIdx + 1;
      newTar->targetName = hlr_strdup (targetName);
    }
    // continue where the extension stopped
    seedIdx = scanIdx;
  }
}
int main (int argc, char *argv[]) { GfrEntry *currGE; BLEntry *currBLE; BLEntry currQuery; FILE *fp; char *line; int count; int countRemoved; int index; WordIter w; Array blackList = arrayCreate(20, BLEntry); config *Conf; if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) { die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH")); return EXIT_FAILURE; } if( confp_get( Conf, "ANNOTATION_DIR")==NULL ) { die("%s:\tCannot find ANNOTATION_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( Conf, "BLACKLIST_FILENAME")==NULL ) { die("%s:\tCannot find BLACKLIST_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } Stringa buffer=stringCreate( 100 ); stringPrintf( buffer, "%s/%s", confp_get( Conf, "ANNOTATION_DIR"), confp_get( Conf, "BLACKLIST_FILENAME") ); /* fp = fopen( string( buffer ), "r" ); if( !fp ) die("Unable to open file: %s", string(buffer)); stringDestroy( buffer ); */ // reading blacklist file LineStream ls = ls_createFromFile( string(buffer) ); while( line = ls_nextLine(ls) ) { w = wordIterCreate( line, "\t", 1); currBLE = arrayp( blackList, arrayMax(blackList), BLEntry); currBLE->gene1 = hlr_strdup ( wordNext(w) ); currBLE->gene2 = hlr_strdup ( wordNext(w) ); wordIterDestroy(w); } //fclose(fp); ls_destroy( ls ); stringDestroy( buffer ); arraySort( blackList, (ARRAYORDERF) sortBlackListByName1); // beginFiltering count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { // reading the gfr if( currGE->geneSymbolTranscript1 == NULL ) { die("Gene symbols are not present in the GFR file. 
Please run gfrAddInfo before gfrBlackListFilter."); return EXIT_FAILURE; } // creating a new query to the black list currQuery.gene1 = currGE->geneSymbolTranscript1; currQuery.gene2 = currGE->geneSymbolTranscript2; if( strEqual( currQuery.gene1 , currQuery.gene2 ) ) { countRemoved++; continue; } // searching against read_1/read_2 int res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1); if( !res ) { // not found, then searching against read_2/read_1 currQuery.gene1 = currGE->geneSymbolTranscript2; currQuery.gene2 = currGE->geneSymbolTranscript1; res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1 ); if( !res ) { // not found, write the instance to stdout, update the counts puts (gfr_writeGfrEntry (currGE)); count++; } else { // found: read2/read1 countRemoved++; } } else { //found: read1/read2 countRemoved++; } } gfr_deInit (); arrayDestroy( blackList ); warn ("%s_BlackListFilter: %s",argv[0], confp_get( Conf, "BLACKLIST_FILENAME")); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close( Conf); return 0; }
int main (int argc, char *argv[]) { Array kgXrefs; Stringa buffer; LineStream ls; int count=0; char* geneSymbolTranscript; char* descriptionTranscript; char* line; char* exonID = NULL; config *conf; if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) return EXIT_FAILURE; buffer = stringCreate (100); stringPrintf (buffer,"%s/%s", confp_get(conf, "ANNOTATION_DIR"), confp_get(conf, "KNOWN_GENE_XREF_FILENAME")); kgXrefs = util_readKnownGeneXrefs (string (buffer)); arraySort (kgXrefs,(ARRAYORDERF)sortKgXrefsByTranscriptName); stringDestroy (buffer); // gfr_init ("-"); ls = ls_createFromFile("-"); while (line = ls_nextLine(ls)) { char *lineP = hlr_strdup(line); WordIter w = wordIterCreate( line, "\t", 0); char *nameTranscript = wordNext( w ); char *p = rindex(nameTranscript, '_'); if (p) { exonID = hlr_strdup( p+1 ); *p='\0'; } transcript2geneSymbolAndGeneDescription(kgXrefs, nameTranscript, &geneSymbolTranscript, &descriptionTranscript); if (exonID) { printf("%s_%s\t%s\t%s\t%s", nameTranscript, exonID, geneSymbolTranscript, exonID, descriptionTranscript); hlr_free(exonID); } else { printf("%s\t%s\t1\t%s", nameTranscript, geneSymbolTranscript, descriptionTranscript); } printf("%s\n", lineP+strlen(nameTranscript)); count++; hlr_free(lineP); wordIterDestroy(w); } ls_destroy (ls); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close(conf); return EXIT_SUCCESS; }
int main (int argc, char *argv[]) { GfrEntry *currGE; int i,j,k,l, h,index; Stringa buffer,cmd,fnSequencesToAlign; FILE *fp; FILE *fp1; FILE *fp2; FILE *freads1; FILE *freads2; Array gfrEntries; BowtieQuery *currBQ,testBQ; BowtieEntry *currBE; Texta seqNames; int readSize1, readSize2, minReadSize; Array bowtieQueries; char transcriptNumber; int isHomologous,homologousCount; int count; int countRemoved; unsigned short int tooMany; BlatQuery *blQ; config *conf; if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) { die("%s:\tCannot find .fusionseqrc", argv[0]); return EXIT_FAILURE; } if ( (confp_get( conf, "BLAT_TWO_BIT_TO_FA")) == NULL) { die("%s:\tCannot find BLAT_TWO_BIT_TO_FA in the configuration file: %s", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if ( (confp_get( conf,"BLAT_DATA_DIR")) == NULL) { die("%s:\tCannot find BLAT_DATA_DIR in the configuration file: %sc", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "TMP_DIR")==NULL ) { die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFSERVER")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) { die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "PSEUDOGENE_DIR")==NULL ) { die("%s:\tCannot find PSEUDOGENE_DIR in the configuration 
file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "PSEUDOGENE_FILENAME")==NULL ) { die("%s:\tCannot find PSEUDOGENE_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } cmd = stringCreate (100); // initializing the gfServers stringPrintf( cmd, "%s status %s %s &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT") ); int ret = hlr_system( string(cmd), 1 ); if( ret != 0 ) { // not initialized stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_genome.log start %s %s %s/%s &", confp_get( conf, "BLAT_GFSERVER"), confp_get(conf, "TMP_DIR"),confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME")); hlr_system( string( cmd ), 0 ); long int startTime = time(0); stringPrintf( cmd , "%s status %s %s &2> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ; if( hlr_system( string(cmd), 1 ) != 0 ) { die("gfServer for %s/%s not initialized: %s %s %s", confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); return EXIT_FAILURE; } } // end initialization gfr_init ("-"); gfrEntries = gfr_parse (); if (arrayMax (gfrEntries) == 0){ puts (gfr_writeHeader ()); gfr_deInit (); return 0; } seqNames = textCreate (10000); buffer = stringCreate (100); fnSequencesToAlign = stringCreate (100); count = 0; countRemoved = 0; stringPrintf( buffer, "%s/%s", confp_get( conf, "PSEUDOGENE_DIR"), confp_get( conf, "PSEUDOGENE_FILENAME") ); intervalFind_addIntervalsToSearchSpace (string(buffer),0); puts (gfr_writeHeader ()); 
for (i = 0; i < arrayMax (gfrEntries); i++) { currGE = arrp (gfrEntries,i,GfrEntry); homologousCount = 0; minReadSize=10000; // creating two fasta files with the two genes stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript1.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA") , confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript1, currGE->startTranscript1, currGE->endTranscript1, confp_get(conf, "TMP_DIR"), currGE->id); hlr_system( string(cmd) , 0); stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript2.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript2, currGE->startTranscript2, currGE->endTranscript2, confp_get(conf, "TMP_DIR"), currGE->id); hlr_system( string(cmd) , 0); Stringa fa1 = stringCreate( 100 ); Stringa fa2 = stringCreate( 100 ); // creating the two fasta files with the reads stringPrintf( fa1, "%s/%s_reads1.fa", confp_get(conf, "TMP_DIR"), currGE->id); if (!(freads1 = fopen ( string(fa1) ,"w"))) { die ("Unable to open file: %s",string (fa1)); } // writing the reads of the first end into file for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) { char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l)); // read1 readSize1 = strlen( currRead1 ); if( readSize1 == 0 ) die("Read size cannot be zero: read1[ %s ]", currRead1); if( readSize1 < minReadSize ) minReadSize = readSize1; fprintf( freads1, ">%d\n%s\n", l, currRead1 ); hlr_free( currRead1 ); } fclose( freads1 ); stringPrintf( fa2, "%s/%s_reads2.fa", confp_get(conf, "TMP_DIR"), currGE->id); if (!(freads2 = fopen ( string(fa2) ,"w"))) { die ("Unable to open file: %s",string (fa2)); } // writing the reads of the second end into file for (l = 0; l < arrayMax (currGE->readsTranscript2); l++) { char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l)); // read2 readSize2 = strlen( currRead2 
); if( readSize2 == 0 ) die("Read size cannot be zero: read2[ %s ]", currRead2); if( readSize2 < minReadSize ) minReadSize = readSize2; fprintf( freads2, ">%d\n%s\n", l, currRead2 ); hlr_free( currRead2 ); } fclose( freads2 ); // collapse the reads 2 ## requires the FASTX package stringPrintf( cmd, "%s -i %s/%s_reads2.fa -o %s/%s_reads2.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); hlr_system (string (cmd),0); //blat of reads2 against the first transcript stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript1.fa %s/%s_reads2.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); // reading the results of blast from Pipe blatParser_initFromPipe( string(cmd) ); while( blQ = blatParser_nextQuery() ) { int nucleotideOverlap = getNucleotideOverlap ( blQ ); if ( nucleotideOverlap > ( ((double)readSize2)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) { char* value = strchr(blQ->qName,'-'); homologousCount+=atoi(value+1); } } blatParser_deInit(); // collapse the reads 1 ## requires the FASTX package on the path stringPrintf( cmd, "%s -i %s/%s_reads1.fa -o %s/%s_reads1.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); hlr_system (string (cmd),0); //blat of reads1 against the second transcript stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript2.fa %s/%s_reads1.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); blatParser_initFromPipe( string(cmd) ); while( blQ = blatParser_nextQuery() ) { int nucleotideOverlap = getNucleotideOverlap ( blQ ); if ( nucleotideOverlap > ( ((double)readSize1)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) { char* value = strchr(blQ->qName,'-'); homologousCount+=atoi(value+1); 
} } blatParser_deInit(); stringPrintf (cmd,"cd %s;rm -rf %s_reads?.fa %s_reads?.collapsed.fa %s_transcript?.fa", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id); hlr_system( string(cmd) , 0); if (((double)homologousCount / (double)arrayMax(currGE->readsTranscript1)) <= atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) { homologousCount = 0; // there is no homology between the two genes, but what about the rest of the genome writeFasta( currGE, &minReadSize, confp_get(conf, "TMP_DIR") ); stringPrintf(cmd, "cd %s; %s %s %s / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.smallhomology.psl &>/dev/null", confp_get(conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), minReadSize - (int)(0.1 * minReadSize) > 20 ? minReadSize - (int) (0.1 * minReadSize) : 20 , currGE->id, currGE->id); int attempts=0; ret = hlr_system( string(cmd), 1 ); while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++; if( attempts == 5000 ) { die("Cannot map the reads %s", string( cmd )); return EXIT_FAILURE; } // reading the results of blast from File stringPrintf(cmd, "%s/%s.smallhomology.psl", confp_get( conf, "TMP_DIR"), currGE->id); blatParser_initFromFile( string(cmd) ); tooMany = 1; while( blQ = blatParser_nextQuery() ) { tooMany = 0; checkPseudogeneOverlap( blQ ); if( arrayMax( blQ->entries ) > 1 ) { homologousCount+= arrayMax( blQ->entries ) - 1; char* value = strchr( blQ->qName,'/' ); if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName ); int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead ); currGIR->flag = 1; } } blatParser_deInit(); if ( tooMany == 1 || ( ( (double) homologousCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) ) > atof(confp_get(conf, 
"MAX_FRACTION_HOMOLOGOUS")) ) ) { countRemoved++; stringPrintf (cmd,"cd %s; rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id); hlr_system( string(cmd), 1 ); continue; } // writing the gfrEntry, if everthing else didn't stop if( homologousCount > 0 ) updateStats( currGE ); puts (gfr_writeGfrEntry (currGE)); count++; // removing temporary files stringPrintf (cmd,"cd %s;rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id); hlr_system( string(cmd) , 1); } else { countRemoved++; } } gfr_deInit (); stringDestroy (fnSequencesToAlign); stringDestroy (cmd); stringDestroy (buffer); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close(conf); return EXIT_SUCCESS; }
int main (int argc, char *argv[]) { char *queryString; if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) return EXIT_FAILURE; cgiInit(); cgiHeader("text/html"); queryString = cgiGet2Post(); if (queryString[0] == '\0') { puts ("<html>"); puts ("<head>"); html_printGenericStyleSheet (12); puts ("<title>geneFusions</title>\n"); puts ("</head>"); puts ("<body>"); puts ("<h1>Identification of potential gene fusions using paired-end reads</h1><br><br>"); printf ("<form action=%s/geneFusions_cgi method=get>", confp_get(Conf, "WEB_URL_CGI")); puts ("<b>Data prefix</b>: "); puts ("<input type=text name=prefix>"); puts ("<br><br><br>"); puts ("<b>Minimum number of paired-end reads connecting two genes</b>: "); puts ("<select name=minNum>"); puts ("<option value=2>2"); puts ("<option value=3>3"); puts ("<option value=5 selected>5"); puts ("<option value=10>10"); puts ("</select>"); puts ("<br><br><br>"); puts ("<b>Type of gene fusion</b>: "); puts ("<select name=type>"); puts ("<option value=read-through>Read-through events"); puts ("<option value=cis>Cis events"); puts ("<option value=intra>Intra-chromosomal events"); puts ("<option value=same>Genes on the same chromosome"); puts ("<option value=inter>Genes on different chromosomes"); puts ("<option value=all selected>All potential gene fusions"); puts ("</select>"); puts ("<br><br><br>"); puts ("<input type=submit value=Submit>"); puts ("<input type=reset value=Reset>"); puts ("</form>"); puts ("</body>"); puts ("</html>"); fflush (stdout); } else { int first; Stringa item = stringCreate (20); Stringa value = stringCreate (20); char *iPtr,*vPtr,*prefix,*type; int minNum; first = 1; cgiGetInit (); while (cgiGetNextPair (&first,item,value)) { iPtr = string (item); vPtr = string (value); if (strEqual (iPtr,"prefix")) { prefix = hlr_strdup (vPtr); } if (strEqual (iPtr,"type")) { type = hlr_strdup (vPtr); } if (strEqual (iPtr,"minNum")) { minNum = atoi (vPtr); } } generateOutput (prefix,type,minNum); } 
confp_close(Conf); return EXIT_SUCCESS; }
int main (int argc, char *argv[]) { Array intervals; Interval *currInterval; SubInterval *currSubInterval; int h,i,j; Array seqs; Seq *currSeq,testSeq; int index; Stringa buffer; Array geneTranscriptEntries; Texta geneTranscriptIds; Array alterations; Alteration *currAlteration,*nextAlteration; char *proteinSequenceBeforeIndel; char *proteinSequenceAfterIndel; int numDisabledTranscripts; Stringa disabledTranscripts; int seqLength,refLength,altLength; char *sequenceBeforeIndel = NULL; int overlapMode; int numOverlaps; int sizeIndel,indelOffset; int overlap; Array coordinates; VcfEntry *currVcfEntry; VcfGenotype *currVcfGenotype; int position; Texta alternateAlleles; int flag1,flag2; if (argc != 3) { usage ("%s <annotation.interval> <annotation.fa>",argv[0]); } intervalFind_addIntervalsToSearchSpace (argv[1],0); geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ()); seq_init (); fasta_initFromFile (argv[2]); seqs = fasta_readAllSequences (0); fasta_deInit (); arraySort (seqs,(ARRAYORDERF)util_sortSequencesByName); buffer = stringCreate (100); disabledTranscripts = stringCreate (100); alterations = arrayCreate (100,Alteration); vcf_init ("-"); stringPrintf (buffer,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s\">",argv[1]); vcf_addComment (string (buffer)); puts (vcf_writeMetaData ()); puts (vcf_writeColumnHeaders ()); while (currVcfEntry = vcf_nextEntry ()) { if (vcf_isInvalidEntry (currVcfEntry)) { continue; } flag1 = 0; flag2 = 0; position = currVcfEntry->position - 1; // make zero-based alternateAlleles = vcf_getAlternateAlleles (currVcfEntry); for (h = 0; h < arrayMax (alternateAlleles); h++) { refLength = strlen (currVcfEntry->referenceAllele); altLength = strlen (textItem (alternateAlleles,h)); sizeIndel = abs (refLength - altLength); indelOffset = MAX (refLength,altLength) - 1; util_clearAlterations (alterations); intervals = intervalFind_getOverlappingIntervals 
(currVcfEntry->chromosome,position,position + indelOffset); for (i = 0; i < arrayMax (intervals); i++) { currInterval = arru (intervals,i,Interval*); overlapMode = OVERLAP_NONE; numOverlaps = 0; for (j = 0; j < arrayMax (currInterval->subIntervals); j++) { currSubInterval = arrp (currInterval->subIntervals,j,SubInterval); overlap = rangeIntersection (position,position + indelOffset,currSubInterval->start,currSubInterval->end); if (currSubInterval->start <= position && (position + indelOffset) < currSubInterval->end) { overlapMode = OVERLAP_FULLY_CONTAINED; numOverlaps++; } else if (j == 0 && overlap > 0 && position < currSubInterval->start) { overlapMode = OVERLAP_START; numOverlaps++; } else if (j == (arrayMax (currInterval->subIntervals) - 1) && overlap > 0 && (position + indelOffset) >= currSubInterval->end) { overlapMode = OVERLAP_END; numOverlaps++; } else if (overlap > 0 && overlap <= indelOffset) { overlapMode = OVERLAP_SPLICE; numOverlaps++; } } if (overlapMode == OVERLAP_NONE) { continue; } currAlteration = arrayp (alterations,arrayMax (alterations),Alteration); if (numOverlaps > 1) { util_addAlteration (currAlteration,currInterval->name,"multiExonHit",currInterval,position,0); continue; } else if (numOverlaps == 1 && overlapMode == OVERLAP_SPLICE) { util_addAlteration (currAlteration,currInterval->name,"spliceOverlap",currInterval,position,0); continue; } else if (numOverlaps == 1 && overlapMode == OVERLAP_START) { util_addAlteration (currAlteration,currInterval->name,"startOverlap",currInterval,position,0); continue; } else if (numOverlaps == 1 && overlapMode == OVERLAP_END) { util_addAlteration (currAlteration,currInterval->name,"endOverlap",currInterval,position,0); continue; } else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength > refLength) { if ((sizeIndel % 3) == 0) { util_addAlteration (currAlteration,currInterval->name,"insertionNFS",currInterval,position,0); } else { util_addAlteration 
(currAlteration,currInterval->name,"insertionFS",currInterval,position,0); } } else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength < refLength) { if ((sizeIndel % 3) == 0) { util_addAlteration (currAlteration,currInterval->name,"deletionNFS",currInterval,position,0); } else { util_addAlteration (currAlteration,currInterval->name,"deletionFS",currInterval,position,0); } } else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength == refLength) { util_addAlteration (currAlteration,currInterval->name,"substitution",currInterval,position,0); } else { die ("Unexpected type: %d %s %s %s", currVcfEntry->position,currVcfEntry->chromosome, currVcfEntry->referenceAllele,currVcfEntry->alternateAllele); } if ((sizeIndel % 3) != 0 && altLength != refLength) { continue; } // Only run the remaining block of code if the indel is fully contained (insertion or deletion) AND does not cause a frameshift OR // if it is a substitution that is fully contained in the coding sequence stringPrintf (buffer,"%s|%s|%c|",currInterval->name,currInterval->chromosome,currInterval->strand); for (j = 0; j < arrayMax (currInterval->subIntervals); j++) { currSubInterval = arrp (currInterval->subIntervals,j,SubInterval); stringAppendf (buffer,"%d|%d%s",currSubInterval->start,currSubInterval->end,j < arrayMax (currInterval->subIntervals) - 1 ? 
"|" : ""); } testSeq.name = hlr_strdup (string (buffer)); if (!arrayFind (seqs,&testSeq,&index,(ARRAYORDERF)util_sortSequencesByName)) { die ("Expected to find %s in seqs",string (buffer)); } hlr_free (testSeq.name); currSeq = arrp (seqs,index,Seq); strReplace (&sequenceBeforeIndel,currSeq->sequence); seqLength = strlen (sequenceBeforeIndel); coordinates = util_getCoordinates (currInterval); // arraySort (coordinates,(ARRAYORDERF)util_sortCoordinatesByChromosomeAndTranscriptPosition); Array is already sorted by definition j = 0; stringClear (buffer); while (j < seqLength) { if (util_getGenomicCoordinate (coordinates,j,currVcfEntry->chromosome) == position) { if (altLength > refLength) { stringCat (buffer,textItem (alternateAlleles,h)); j++; continue; } else if (altLength < refLength) { stringCatChar (buffer,sequenceBeforeIndel[j]); j = j + refLength - altLength + 1; continue; } else { stringCat (buffer,textItem (alternateAlleles,h)); j = j + altLength; continue; } } stringCatChar (buffer,sequenceBeforeIndel[j]); j++; } util_destroyCoordinates (coordinates); proteinSequenceBeforeIndel = hlr_strdup (util_translate (currInterval,sequenceBeforeIndel)); proteinSequenceAfterIndel = hlr_strdup (util_translate (currInterval,string (buffer))); addSubstitution (currAlteration,proteinSequenceBeforeIndel,proteinSequenceAfterIndel,indelOffset); hlr_free (proteinSequenceBeforeIndel); hlr_free (proteinSequenceAfterIndel); } if (arrayMax (alterations) == 0) { continue; } arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType); stringClear (buffer); i = 0; while (i < arrayMax (alterations)) { currAlteration = arrp (alterations,i,Alteration); stringAppendf (buffer,"%s%d:%s:%s:%c:%s",stringLen (buffer) == 0 ? 
"" : ",",h + 1,currAlteration->geneName,currAlteration->geneId,currAlteration->strand,currAlteration->type); stringClear (disabledTranscripts); if (currAlteration->substitution[0] != '\0') { stringAppendf (disabledTranscripts,"%s:%s:%d_%d_%s",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition,currAlteration->substitution); } else if (strEqual (currAlteration->type,"multiExonHit") || strEqual (currAlteration->type,"spliceOverlap") || strEqual (currAlteration->type,"startOverlap") || strEqual (currAlteration->type,"endOverlap")) { stringAppendf (disabledTranscripts,"%s:%s:%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength); } else { stringAppendf (disabledTranscripts,"%s:%s:%d_%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition); } numDisabledTranscripts = 1; j = i + 1; while (j < arrayMax (alterations)) { nextAlteration = arrp (alterations,j,Alteration); if (strEqual (currAlteration->geneId,nextAlteration->geneId) && strEqual (currAlteration->type,nextAlteration->type)) { if (nextAlteration->substitution[0] != '\0') { stringAppendf (disabledTranscripts,":%s:%s:%d_%d_%s",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition,nextAlteration->substitution); } else if (strEqual (nextAlteration->type,"multiExonHit") || strEqual (nextAlteration->type,"spliceOverlap") || strEqual (nextAlteration->type,"startOverlap") || strEqual (nextAlteration->type,"endOverlap")) { stringAppendf (disabledTranscripts,":%s:%s:%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength); } else { stringAppendf (disabledTranscripts,":%s:%s:%d_%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition); } 
numDisabledTranscripts++; } else { break; } j++; } i = j; geneTranscriptIds = util_getTranscriptIdsForGeneId (geneTranscriptEntries,currAlteration->geneId); stringAppendf (buffer,":%d/%d:%s",numDisabledTranscripts,arrayMax (geneTranscriptIds),string (disabledTranscripts)); } if (flag1 == 0) { printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=", currVcfEntry->chromosome,currVcfEntry->position,currVcfEntry->id, currVcfEntry->referenceAllele,currVcfEntry->alternateAllele, currVcfEntry->quality,currVcfEntry->filter,currVcfEntry->info); flag1 = 1; } printf ("%s%s",flag2 == 1 ? "," : "",string (buffer)); flag2 = 1; } if (flag1 == 1) { for (i = 0; i < arrayMax (currVcfEntry->genotypes); i++) { currVcfGenotype = arrp (currVcfEntry->genotypes,i,VcfGenotype); if (i == 0) { printf ("\t%s\t",currVcfEntry->genotypeFormat); } printf ("%s%s%s%s",currVcfGenotype->genotype, currVcfGenotype->details[0] != '\0' ? ":" : "", currVcfGenotype->details[0] != '\0' ? currVcfGenotype->details : "", i < arrayMax (currVcfEntry->genotypes) - 1 ? "\t" : ""); } puts (""); } } vcf_deInit (); return 0; }
int main (int argc, char *argv[]) { GfrEntry *currGE; BLEntry *currBLE; BLEntry currQuery; FILE *fp; char *line; int count; int countRemoved; int index; WordIter w; Array blackList = arrayCreate(20, BLEntry); if (argc != 2) { usage ("%s <blackList.txt>",argv[0]); } fp = fopen( argv[1], "r" ); if( !fp ) die("Unable to open file: %s", argv[1]); // reading blacklist file LineStream ls = ls_createFromFile( argv[1] ); while( line = ls_nextLine(ls) ) { w = wordIterCreate( line, "\t", 1); currBLE = arrayp( blackList, arrayMax(blackList), BLEntry); currBLE->gene1 = hlr_strdup ( wordNext(w) ); currBLE->gene2 = hlr_strdup ( wordNext(w) ); wordIterDestroy(w); } fclose(fp); arraySort( blackList, (ARRAYORDERF) sortBlackListByName1); // beginFiltering count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { // reading the gfr // creating a new query to the black list currQuery.gene1 = currGE->geneSymbolTranscript1; currQuery.gene2 = currGE->geneSymbolTranscript2; // searching against read_1/read_2 int res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1); if( !res ) { // not found, then searching against read_2/read_1 currQuery.gene1 = currGE->geneSymbolTranscript2; currQuery.gene2 = currGE->geneSymbolTranscript1; res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1 ); if( !res ) { // not found, write the instance to stdout, update the counts puts (gfr_writeGfrEntry (currGE)); count++; } else { // found: read2/read1 countRemoved++; } } else { //found: read1/read2 countRemoved++; } } gfr_deInit (); arrayDestroy( blackList ); warn ("%s_BlackListFilter: %s",argv[0], argv[1]); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); return 0; }