int main (int argc, char *argv[]) { GfrEntry *currGE; int count; int countRemoved; if (argc != 2) { usage ("%s <string>",argv[0]); } count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { if (currGE->descriptionTranscript1 == NULL || currGE->descriptionTranscript2 == NULL) { die ("Transcript description is missing"); } if (strCaseStr (currGE->descriptionTranscript1,argv[1]) || strCaseStr (currGE->descriptionTranscript2,argv[1])) { countRemoved++; continue; } puts (gfr_writeGfrEntry (currGE)); count++; } gfr_deInit (); warn ("%s_string: %s",argv[0],argv[1]); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); return 0; }
int main (int argc, char *argv[]) { GfrEntry *currGE; int count; int countRemoved; int i; if (argc != 3) { usage ("%s <offsetCutoff> <minNumUniqueReads>",argv[0]); } count = 0; countRemoved = 0; int offsetCutOff = atoi (argv[1]); int minNumUniqueReads = atoi (argv[2]); gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { Array starts = arrayCreate( 100, int); for (i = 0; i < arrayMax( currGE->interReads ); i++) { int currStart = arrp(currGE->interReads, i, GfrInterRead)->readStart1 + arrp(currGE->interReads, i, GfrInterRead)->readStart2; array(starts, arrayMax(starts), int) = currStart; } arraySort( starts, (ARRAYORDERF) arrayIntcmp ); arrayUniq( starts, NULL, (ARRAYORDERF) arrayIntcmp ) ; int numUniqeOffsets = arrayMax( starts ); arrayDestroy( starts ); if (arrayMax( currGE->readsTranscript1 ) != arrayMax( currGE->readsTranscript2 ) ) die( "The two ends have a different number of reads"); Texta reads = textCreate(arrayMax(currGE->readsTranscript1)); for (i = 0; i < arrayMax(currGE->readsTranscript1); i++) { Stringa strA = stringCreate( strlen(textItem( currGE->readsTranscript1, i) ) * 2 + 1); stringAppendf( strA, textItem( currGE->readsTranscript1,i)); stringAppendf( strA, textItem( currGE->readsTranscript2,i)); textAdd( reads, string(strA)); stringDestroy( strA ); } textUniqKeepOrder( reads ); int numRemaining = arrayMax( reads ); textDestroy ( reads ); if (numRemaining <= minNumUniqueReads || numUniqeOffsets <= offsetCutOff) { countRemoved++; continue; } puts (gfr_writeGfrEntry (currGE)); count++; } gfr_deInit (); warn("%s_PCRFilter: offset=%d minNumUniqueReads=%d", argv[0],offsetCutOff, minNumUniqueReads); warn("%s_numRemoved: %d",argv[0],countRemoved); warn("%s_numGfrEntries: %d",argv[0],count); return 0; }
int main (int argc, char *argv[]) { GfrEntry *currGE; int count; int countRemoved; int i, j; int foundEST; if (argc != 2) { usage ("%s <EST.interval>",argv[0]); } intervalFind_addIntervalsToSearchSpace( argv[1], 0); // beginFiltering count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { // reading the gfr foundEST = 0; if( strEqual( currGE->fusionType, "cis" ) ) { if( ! strEqual( currGE->chromosomeTranscript1, currGE->chromosomeTranscript2 ) ) die("The two genes are not on the same chromosomes: %s - %s", currGE->chromosomeTranscript1, currGE->chromosomeTranscript2 ); int start1, end1, start2, end2; findCoordinates( currGE, &start1, &end1, &start2, &end2 ); Array intervals1 = arrayCopy( intervalFind_getOverlappingIntervals( currGE->chromosomeTranscript1, start1, end1 ) ); Array intervals2 = intervalFind_getOverlappingIntervals( currGE->chromosomeTranscript2, start2, end2 ); for( i=0; i<arrayMax( intervals1 ); i++ ) { Interval* currInterval1 = arru( intervals1, i, Interval* ); for( j=0; j<arrayMax ( intervals2 ); j++ ) { Interval* currInterval2 = arru( intervals2, j, Interval* ); if( currInterval1==currInterval2 ) { foundEST = 1; i = arrayMax( intervals1 ); j = arrayMax( intervals2 ); } } } arrayDestroy( intervals1 ); } if( foundEST ) countRemoved++; else { puts (gfr_writeGfrEntry (currGE)); count++; } } gfr_deInit (); warn ("%s_EST_data: %s",argv[0], argv[1]); warn ("%s_numRemoved: %d",argv[0], countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); return 0; }
/**
 * Add the pair-count column to a GFR stream.
 *
 * Registers GFR_COLUMN_NAME_PAIR_COUNT as an additional column, writes the
 * header, then calls obtainPairCounts() on every entry before writing it.
 * stdout is flushed after each entry. The number of entries written is
 * reported via warn().
 *
 * @return 0 after all entries have been processed.
 */
int main (int argc, char *argv[])
{
  GfrEntry *entry;
  int numWritten;

  gfr_init ("-");
  gfr_addNewColumnType (GFR_COLUMN_NAME_PAIR_COUNT);
  puts (gfr_writeHeader ());

  for (numWritten = 0; (entry = gfr_nextEntry ()) != NULL; numWritten++) {
    obtainPairCounts (entry);
    puts (gfr_writeGfrEntry (entry));
    fflush (stdout);
  }
  gfr_deInit ();

  warn ("%s_numGfrEntries: %d",argv[0],numWritten);
  return 0;
}
int main (int argc, char *argv[]) { GfrEntry *currGE; Array kgTreeFams; Stringa buffer; int count; int countRemoved; config *conf; if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) return EXIT_FAILURE; buffer = stringCreate (100); stringPrintf (buffer,"%s/%s", confp_get(conf, "ANNOTATION_DIR"), confp_get(conf, "KNOWN_GENE_TREE_FAM_FILENAME")); kgTreeFams = util_readKnownGeneTreeFams (string (buffer)); arraySort (kgTreeFams,(ARRAYORDERF)sortKgTreeFamsByTranscriptName); stringDestroy (buffer); count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()){ if (isHomologous (kgTreeFams,currGE->nameTranscript1,currGE->nameTranscript2)) { countRemoved++; continue; } puts (gfr_writeGfrEntry (currGE)); count++; } gfr_deInit (); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close(conf); return EXIT_SUCCESS; }
int main (int argc, char *argv[]) { GfrEntry *currGE; GfrInterRead *currGIR; int i; Stringa buffer; FILE *fp1,*fp2; int count; count = 0; buffer = stringCreate (100); gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { stringPrintf (buffer,"%s_1.bed",currGE->id); fp1 = fopen (string (buffer),"w"); stringPrintf (buffer,"%s_2.bed",currGE->id); fp2 = fopen (string (buffer),"w"); if (fp1 == NULL || fp2 == NULL) { die ("Unable to open BED files"); } fprintf (fp1,"browser full knownGene\n"); fprintf (fp1,"track name=\"Inter paird-ends: %s_1\" visibility=2\n",currGE->id); fprintf (fp2,"browser full knownGene\n"); fprintf (fp2,"track name=\"Inter paird-ends: %s_2\" visibility=2\n",currGE->id); for (i = 0; i < arrayMax (currGE->interReads); i++) { currGIR = arrp (currGE->interReads,i,GfrInterRead); fprintf (fp1,"%s\t%d\t%d\n",currGE->chromosomeTranscript1,currGIR->readStart1,currGIR->readEnd1); fprintf (fp2,"%s\t%d\t%d\n",currGE->chromosomeTranscript2,currGIR->readStart2,currGIR->readEnd2); } fclose (fp1); fclose (fp2); puts (gfr_writeGfrEntry (currGE)); count++; } gfr_deInit (); stringDestroy (buffer); warn ("%s_numGfrEntries: %d",argv[0],count); return 0; }
int main (int argc, char *argv[]) { GfrEntry *currGE; int count; int countRemoved; int mitochondrialCount; unsigned int minReadSize; int i; Stringa cmd; BlatQuery *blQ=NULL; config *conf = NULL; /**< Pointer to configuration file .fusionseqrc */ if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) { die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH")); return EXIT_FAILURE; } if( confp_get( conf,"MAX_OVERLAP_ALLOWED")==NULL ) { die("%s:\tCannot find MAX_OVERLAP_ALLOWED in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf,"MAX_FRACTION_HOMOLOGOUS")==NULL ) { die("%s:\tCannot find MAX_FRACTION_HOMOLOGOUS in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "MITOCHONDRIAL_DIR")==NULL ) { die("%s:\tCannot find MITOCHONDRIAL_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "MITOCHONDRIAL_FILENAME")==NULL ) { die("%s:\tCannot find MITOCHONDRIAL_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "TMP_DIR")==NULL ) { die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFSERVER")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) { die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) { die("%s:\tCannot find 
BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } count = 0; countRemoved = 0; cmd = stringCreate (100); // initializing the gfServers stringPrintf( cmd, "%s status %s %d &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2); int ret = hlr_system( string(cmd), 1 ); if( ret != 0 ) { // not initialized stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_mitochondrial.log start %s %d %s/%s &", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "TMP_DIR"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2, confp_get( conf, "MITOCHONDRIAL_DIR"), confp_get( conf,"MITOCHONDRIAL_FILENAME")); hlr_system( string( cmd ), 0 ); long int startTime = time(0); stringPrintf( cmd , "%s status %s %d &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2); while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ; if( hlr_system( string(cmd), 1 ) != 0 ) { die("gfServer for %s/%s not initialized: %s %s %s", confp_get( conf, "MITOCHONDRIAL_DIR"), confp_get( conf, "MITOCHONDRIAL_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); return EXIT_FAILURE; } } gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { if (strEqual(currGE->chromosomeTranscript1, "chrM") || strEqual(currGE->chromosomeTranscript2, "chrM")) { countRemoved++; continue; } else { mitochondrialCount = 0; minReadSize=1000; writeFasta( currGE, &minReadSize, confp_get( conf, "TMP_DIR") ); // in util.c stringPrintf(cmd, "cd %s;%s %s %d / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.mito.psl &>/dev/null", confp_get( conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), 
atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2, minReadSize - 5 > 20 ? minReadSize - 5 : 20 , currGE->id, currGE->id); int attempts=0; ret = hlr_system( string(cmd), 1 ); while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++; if( attempts == 5000 ) { die("Cannot map the reads %s", string( cmd )); return EXIT_FAILURE; } // reading the results of blast from File stringPrintf(cmd, "%s/%s.mito.psl", confp_get( conf, "TMP_DIR"), currGE->id); blatParser_initFromFile( string(cmd) ); while( blQ = blatParser_nextQuery() ) { //warn("iter %d\tquery %s", iter, blQ->qName );iter++; int nucleotideOverlap = getNucleotideOverlap ( blQ ); if (nucleotideOverlap > (((double) minReadSize) * strtod(confp_get( conf, "MAX_OVERLAP_ALLOWED"), NULL))) { char* value = strchr( blQ->qName,'/' ); if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName ); int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead ); currGIR->flag = 1; mitochondrialCount++; } } blatParser_deInit(); if ( ( (double) mitochondrialCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) ) <= strtod(confp_get( conf, "MAX_FRACTION_HOMOLOGOUS"), NULL)) { if( mitochondrialCount > 0 ) updateStats( currGE ); // writing the gfrEntry puts (gfr_writeGfrEntry (currGE)); count++; } else { countRemoved++; } // removing temporary files stringPrintf (cmd,"rm -rf %s/%s_reads.fa %s/%s.mito.psl", confp_get( conf, "TMP_DIR"), currGE->id, confp_get( conf, "TMP_DIR"), currGE->id ); hlr_system( string(cmd) , 1); } } gfr_deInit (); stringDestroy( cmd ); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close(conf); return 0; }
int main (int argc, char *argv[]) { GfrEntry *currGE; int i,j,k,l, h,index; Stringa buffer,cmd,fnSequencesToAlign; FILE *fp; FILE *fp1; FILE *fp2; FILE *freads1; FILE *freads2; Array gfrEntries; BowtieQuery *currBQ,testBQ; BowtieEntry *currBE; Texta seqNames; int readSize1, readSize2, minReadSize; Array bowtieQueries; char transcriptNumber; int isHomologous,homologousCount; int count; int countRemoved; unsigned short int tooMany; BlatQuery *blQ; config *conf; if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) { die("%s:\tCannot find .fusionseqrc", argv[0]); return EXIT_FAILURE; } if ( (confp_get( conf, "BLAT_TWO_BIT_TO_FA")) == NULL) { die("%s:\tCannot find BLAT_TWO_BIT_TO_FA in the configuration file: %s", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if ( (confp_get( conf,"BLAT_DATA_DIR")) == NULL) { die("%s:\tCannot find BLAT_DATA_DIR in the configuration file: %sc", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "TMP_DIR")==NULL ) { die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFSERVER")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) { die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) { die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "PSEUDOGENE_DIR")==NULL ) { die("%s:\tCannot find PSEUDOGENE_DIR in the configuration 
file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( conf, "PSEUDOGENE_FILENAME")==NULL ) { die("%s:\tCannot find PSEUDOGENE_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } cmd = stringCreate (100); // initializing the gfServers stringPrintf( cmd, "%s status %s %s &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT") ); int ret = hlr_system( string(cmd), 1 ); if( ret != 0 ) { // not initialized stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_genome.log start %s %s %s/%s &", confp_get( conf, "BLAT_GFSERVER"), confp_get(conf, "TMP_DIR"),confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME")); hlr_system( string( cmd ), 0 ); long int startTime = time(0); stringPrintf( cmd , "%s status %s %s &2> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ; if( hlr_system( string(cmd), 1 ) != 0 ) { die("gfServer for %s/%s not initialized: %s %s %s", confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); return EXIT_FAILURE; } } // end initialization gfr_init ("-"); gfrEntries = gfr_parse (); if (arrayMax (gfrEntries) == 0){ puts (gfr_writeHeader ()); gfr_deInit (); return 0; } seqNames = textCreate (10000); buffer = stringCreate (100); fnSequencesToAlign = stringCreate (100); count = 0; countRemoved = 0; stringPrintf( buffer, "%s/%s", confp_get( conf, "PSEUDOGENE_DIR"), confp_get( conf, "PSEUDOGENE_FILENAME") ); intervalFind_addIntervalsToSearchSpace (string(buffer),0); puts (gfr_writeHeader ()); 
for (i = 0; i < arrayMax (gfrEntries); i++) { currGE = arrp (gfrEntries,i,GfrEntry); homologousCount = 0; minReadSize=10000; // creating two fasta files with the two genes stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript1.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA") , confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript1, currGE->startTranscript1, currGE->endTranscript1, confp_get(conf, "TMP_DIR"), currGE->id); hlr_system( string(cmd) , 0); stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript2.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript2, currGE->startTranscript2, currGE->endTranscript2, confp_get(conf, "TMP_DIR"), currGE->id); hlr_system( string(cmd) , 0); Stringa fa1 = stringCreate( 100 ); Stringa fa2 = stringCreate( 100 ); // creating the two fasta files with the reads stringPrintf( fa1, "%s/%s_reads1.fa", confp_get(conf, "TMP_DIR"), currGE->id); if (!(freads1 = fopen ( string(fa1) ,"w"))) { die ("Unable to open file: %s",string (fa1)); } // writing the reads of the first end into file for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) { char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l)); // read1 readSize1 = strlen( currRead1 ); if( readSize1 == 0 ) die("Read size cannot be zero: read1[ %s ]", currRead1); if( readSize1 < minReadSize ) minReadSize = readSize1; fprintf( freads1, ">%d\n%s\n", l, currRead1 ); hlr_free( currRead1 ); } fclose( freads1 ); stringPrintf( fa2, "%s/%s_reads2.fa", confp_get(conf, "TMP_DIR"), currGE->id); if (!(freads2 = fopen ( string(fa2) ,"w"))) { die ("Unable to open file: %s",string (fa2)); } // writing the reads of the second end into file for (l = 0; l < arrayMax (currGE->readsTranscript2); l++) { char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l)); // read2 readSize2 = strlen( currRead2 
); if( readSize2 == 0 ) die("Read size cannot be zero: read2[ %s ]", currRead2); if( readSize2 < minReadSize ) minReadSize = readSize2; fprintf( freads2, ">%d\n%s\n", l, currRead2 ); hlr_free( currRead2 ); } fclose( freads2 ); // collapse the reads 2 ## requires the FASTX package stringPrintf( cmd, "%s -i %s/%s_reads2.fa -o %s/%s_reads2.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); hlr_system (string (cmd),0); //blat of reads2 against the first transcript stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript1.fa %s/%s_reads2.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); // reading the results of blast from Pipe blatParser_initFromPipe( string(cmd) ); while( blQ = blatParser_nextQuery() ) { int nucleotideOverlap = getNucleotideOverlap ( blQ ); if ( nucleotideOverlap > ( ((double)readSize2)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) { char* value = strchr(blQ->qName,'-'); homologousCount+=atoi(value+1); } } blatParser_deInit(); // collapse the reads 1 ## requires the FASTX package on the path stringPrintf( cmd, "%s -i %s/%s_reads1.fa -o %s/%s_reads1.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); hlr_system (string (cmd),0); //blat of reads1 against the second transcript stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript2.fa %s/%s_reads1.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id ); blatParser_initFromPipe( string(cmd) ); while( blQ = blatParser_nextQuery() ) { int nucleotideOverlap = getNucleotideOverlap ( blQ ); if ( nucleotideOverlap > ( ((double)readSize1)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) { char* value = strchr(blQ->qName,'-'); homologousCount+=atoi(value+1); 
} } blatParser_deInit(); stringPrintf (cmd,"cd %s;rm -rf %s_reads?.fa %s_reads?.collapsed.fa %s_transcript?.fa", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id); hlr_system( string(cmd) , 0); if (((double)homologousCount / (double)arrayMax(currGE->readsTranscript1)) <= atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) { homologousCount = 0; // there is no homology between the two genes, but what about the rest of the genome writeFasta( currGE, &minReadSize, confp_get(conf, "TMP_DIR") ); stringPrintf(cmd, "cd %s; %s %s %s / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.smallhomology.psl &>/dev/null", confp_get(conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), minReadSize - (int)(0.1 * minReadSize) > 20 ? minReadSize - (int) (0.1 * minReadSize) : 20 , currGE->id, currGE->id); int attempts=0; ret = hlr_system( string(cmd), 1 ); while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++; if( attempts == 5000 ) { die("Cannot map the reads %s", string( cmd )); return EXIT_FAILURE; } // reading the results of blast from File stringPrintf(cmd, "%s/%s.smallhomology.psl", confp_get( conf, "TMP_DIR"), currGE->id); blatParser_initFromFile( string(cmd) ); tooMany = 1; while( blQ = blatParser_nextQuery() ) { tooMany = 0; checkPseudogeneOverlap( blQ ); if( arrayMax( blQ->entries ) > 1 ) { homologousCount+= arrayMax( blQ->entries ) - 1; char* value = strchr( blQ->qName,'/' ); if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName ); int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead ); currGIR->flag = 1; } } blatParser_deInit(); if ( tooMany == 1 || ( ( (double) homologousCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) ) > atof(confp_get(conf, 
"MAX_FRACTION_HOMOLOGOUS")) ) ) { countRemoved++; stringPrintf (cmd,"cd %s; rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id); hlr_system( string(cmd), 1 ); continue; } // writing the gfrEntry, if everthing else didn't stop if( homologousCount > 0 ) updateStats( currGE ); puts (gfr_writeGfrEntry (currGE)); count++; // removing temporary files stringPrintf (cmd,"cd %s;rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id); hlr_system( string(cmd) , 1); } else { countRemoved++; } } gfr_deInit (); stringDestroy (fnSequencesToAlign); stringDestroy (cmd); stringDestroy (buffer); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close(conf); return EXIT_SUCCESS; }
static void generateOutput (char* prefix, char* typeSelected, int minNum) { GfrEntry *currGE; Stringa buffer; char *pos; puts ("<html>"); puts ("<head>"); puts ("<title>Results - Gene Fusions</title>"); html_printGenericStyleSheet (12); puts ("</head>"); puts ("<body>"); if (prefix[0] == '\0') { die ("Invalid prefix"); } printf ("<h1>Results - %s</h1><br><br><br>",prefix); buffer = stringCreate(50); //Chromosome expression, if present LineStream ls; char* chrSignal=NULL; stringPrintf(buffer, "ls -1 %s/BGRS/%s_chr*.bgr.gz 2> /dev/null", confp_get(Conf, "WEB_DATA_DIR"), prefix); ls = ls_createFromPipe(string(buffer)); int countCol = 0; puts ("Expression signal: "); fflush(stdout); while( chrSignal = ls_nextLine(ls)) { char* chrTmp = stringBetween( prefix, ".bgr.gz", chrSignal ); chrTmp++; printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target='blank'>%s</a>] ", htmlLinker_generateLinkToGenomeBrowserAtUCSC("hg18","vertebrate","human", chrTmp, confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"), 50000000 + confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), confp_get(Conf, "WEB_DATA_LINK"), prefix, chrTmp, chrTmp); if (countCol > 10) { puts( "<BR>" ); countCol=0; } countCol++; } if( countCol==0) puts( "No data available yet" ); ls_destroy(ls); puts ("<br><br>"); puts ("For a definition of SPER, DASPER and RESPER see <a href=http://rnaseq.gersteinlab.org/fusionseq/>FusionSeq</a>"); puts ("<br><br>"); puts ("<br><table border=0 width=100% align=center cellpadding=10>"); puts ("<tr align=left>"); puts ("<th>SPER</th>"); puts ("<th>DASPER</th>"); puts ("<th>RESPER</th>"); puts ("<th>Number of inter paired-end reads</th>"); puts ("<th>Type</th>"); puts ("<th>Genomic coordinates</th>"); puts ("<th>Gene symbol</th>"); puts ("<th>Description</th>"); puts ("<th>Genomic coordinates</th>"); puts ("<th>Gene symbol</th>"); puts ("<th>Description</th>"); puts ("<th></th>"); puts ("</tr>"); fflush(stdout); stringPrintf (buffer,"%s/%s.gfr", confp_get(Conf, 
"WEB_DATA_DIR"), prefix); gfr_init (string (buffer)); int countElements = 0; while (currGE = gfr_nextEntry ()) { if (currGE->numInter < minNum) { continue; } if (strEqual (typeSelected,"all") || strEqual (currGE->fusionType,typeSelected) || ( strEqual(currGE->fusionType,"cis") && strEqual( typeSelected,"same") ) || ( strEqual(currGE->fusionType,"read-through") && strEqual( typeSelected,"same") ) ) { if (pos = strchr (currGE->descriptionTranscript1,'|')) { *pos = '\0'; } if (pos = strchr (currGE->descriptionTranscript2,'|')) { *pos = '\0'; } puts ("<tr>"); printf ("<td align=left>%1.3f</td>\n",currGE->SPER); printf ("<td align=left>%1.3f</td>\n",currGE->DASPER); printf ("<td align=left>%1.3f</td>\n",currGE->RESPER); printf ("<td align=left>%d</td>\n",currGE->numInter); printf ("<td align=left>%s</td>\n",currGE->fusionType); printf ("<td align=left><a href=%s target=blank>%s:%d-%d</a></td>\n", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript1, currGE->startTranscript1 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript1 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), currGE->chromosomeTranscript1,currGE->startTranscript1,currGE->endTranscript1); printf ("<td align=left>%s</td>\n",processString (currGE->geneSymbolTranscript1)); printf ("<td align=left>%s</td>\n",currGE->descriptionTranscript1); printf ("<td align=left><a href=%s target=blank>%s:%d-%d</a></td>\n", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript2, currGE->startTranscript2 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript2 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), currGE->chromosomeTranscript2,currGE->startTranscript2,currGE->endTranscript2); printf ("<td align=left>%s</td>\n",processString (currGE->geneSymbolTranscript2)); printf ("<td 
align=left>%s</td>\n",currGE->descriptionTranscript2); printf ("<td align=left><a href=%s/showDetails_cgi?%s+%s>Details</a></td>\n", confp_get(Conf, "WEB_URL_CGI"), prefix,currGE->id); puts ("</tr>"); countElements++; } } gfr_deInit (); stringDestroy (buffer); puts ("</table><br><br>"); if( countElements == 0) puts("No fusion candidates can be found satisfying all specified criteria."); puts ("</body>"); puts ("</html>"); fflush (stdout); }
int main (int argc, char *argv[]) { GfrEntry *currGE; BLEntry *currBLE; BLEntry currQuery; FILE *fp; char *line; int count; int countRemoved; int index; WordIter w; Array blackList = arrayCreate(20, BLEntry); if (argc != 2) { usage ("%s <blackList.txt>",argv[0]); } fp = fopen( argv[1], "r" ); if( !fp ) die("Unable to open file: %s", argv[1]); // reading blacklist file LineStream ls = ls_createFromFile( argv[1] ); while( line = ls_nextLine(ls) ) { w = wordIterCreate( line, "\t", 1); currBLE = arrayp( blackList, arrayMax(blackList), BLEntry); currBLE->gene1 = hlr_strdup ( wordNext(w) ); currBLE->gene2 = hlr_strdup ( wordNext(w) ); wordIterDestroy(w); } fclose(fp); arraySort( blackList, (ARRAYORDERF) sortBlackListByName1); // beginFiltering count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { // reading the gfr // creating a new query to the black list currQuery.gene1 = currGE->geneSymbolTranscript1; currQuery.gene2 = currGE->geneSymbolTranscript2; // searching against read_1/read_2 int res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1); if( !res ) { // not found, then searching against read_2/read_1 currQuery.gene1 = currGE->geneSymbolTranscript2; currQuery.gene2 = currGE->geneSymbolTranscript1; res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1 ); if( !res ) { // not found, write the instance to stdout, update the counts puts (gfr_writeGfrEntry (currGE)); count++; } else { // found: read2/read1 countRemoved++; } } else { //found: read1/read2 countRemoved++; } } gfr_deInit (); arrayDestroy( blackList ); warn ("%s_BlackListFilter: %s",argv[0], argv[1]); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); return 0; }
int main (int argc, char *argv[]) { FILE* ftmp = NULL; if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) return EXIT_FAILURE; cgiInit(); cgiHeader("text/html"); if (argc == 3) { GfrEntry *currGE; Stringa buffer; GfrPairCount *currGEPC; GfrInterRead *currGIR; int i; puts ("<html>"); puts ("<head>"); html_printGenericStyleSheet (12); puts ("<title>geneFusions Details</title>\n"); puts ("</head>"); puts ("<body>"); buffer = stringCreate (100); stringPrintf (buffer, "%s/%s.gfr", confp_get(Conf, "WEB_DATA_DIR"),argv[1]); gfr_init (string (buffer)); while (currGE = gfr_nextEntry ()){ fflush( stdout ); if (!strEqual (currGE->id,argv[2])) { continue; } printf ("<h1>Detailed summary for potential gene fusion candidate</h1><br>"); puts ("<table border=0 cellpadding=10>"); puts ("<tr align=left valign=top>"); puts ("<td width=400>"); puts ("<h2>Summary information</h2><br>"); printf ("<b>Identifier</b>: %s<br><br>\n",currGE->id); printf ("<b>Number of inter paired-end reads</b>: %d<br><br>\n",currGE->numInter); printf ("<b>Type</b>: %s<br><br>\n",currGE->fusionType); stringPrintf(buffer, "%s/GFF/%s.gff", confp_get(Conf, "WEB_DATA_DIR"),currGE->id); ftmp = fopen( string(buffer), "r" ); // displaying this only if data are present if (ftmp) { printf("<b>Connected Reads</b>: <a href=%s&hgt.customText=%s/GFF/%s.gff target=blank>UCSC connectivity graph</a><br>\n", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript1, currGE->startTranscript1 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript2 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"),currGE->id); fclose( ftmp ); } puts ("</td>"); puts ("<td>"); puts ("<h2>Transcript connectivity graph</h2>"); printf ("<img src=%s/IMAGES/%s.jpg alt=geneFusionImage>\n", confp_get(Conf, "WEB_DATA_LINK"), currGE->id); puts ("</td>"); puts ("<td>"); puts ("<h2>Transcript connectivity 
table</h2><br>"); puts ("<table border=0>"); puts ("<tr align=left>"); puts ("<th width=200>Pair Type</th>"); puts ("<th width=200>Entry transcript 1</th>"); puts ("<th width=200>Entry transcript 2</th>"); puts ("<th width=200>Counts</th>"); puts ("</tr>"); fflush( stdout ); for (i = 0; i < arrayMax (currGE->pairCounts); i++) { currGEPC = arrp (currGE->pairCounts,i,GfrPairCount); printf ("<tr><td>%s</td><td>%s</td><td>%s</td><td>%.2f</td></tr>\n", getPairTypeName(currGEPC->pairType), getEntryNumber(currGEPC->number1, currGEPC->pairType, 1), getEntryNumber(currGEPC->number2, currGEPC->pairType, 2), currGEPC->count); } puts ("</table>"); puts ("</td>"); puts ("</tr>"); puts ("</table>"); puts ("<br>"); puts ("<h2>Transcript information</h2><br>"); puts ("<table border=1 cellpadding=10 width=\"80%\">"); puts ("<tr align=left>"); puts ("<th width=\"20%\"></th>"); puts ("<th><font color='blue'>Transcript 1</font></th>"); puts ("<th><font color='orange'>Transcript 2</font></th>"); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Gene symbol(s)</b></td>"); printf ("<td width=\"30%%\"><font color='blue'>%s</font></td>\n",processString (currGE->geneSymbolTranscript1)); printf ("<td width=\"30%%\"><font color='orange'>%s</font></td>\n",processString (currGE->geneSymbolTranscript2)); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Coordinates</b></td>"); printf ("<td width=\"30%%\">%s:%d-%d</td>\n",currGE->chromosomeTranscript1,currGE->startTranscript1,currGE->endTranscript1); printf ("<td width=\"30%%\">%s:%d-%d</td>\n",currGE->chromosomeTranscript2,currGE->startTranscript2,currGE->endTranscript2); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Strand</b></td>"); printf ("<td width=\"30%%\">%c</td>\n",currGE->strandTranscript1); printf ("<td width=\"30%%\">%c</td>\n",currGE->strandTranscript2); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Gene description(s)</b></td>"); printf ("<td 
width=\"30%%\">%s</td>\n",processString (currGE->descriptionTranscript1)); printf ("<td width=\"30%%\">%s</td>\n",processString (currGE->descriptionTranscript2)); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Number of exons</b></td>"); printf ("<td width=\"30%%\">%d</td>\n",currGE->numExonsTranscript1); printf ("<td width=\"30%%\">%d</td>\n",currGE->numExonsTranscript2); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Number of intra paired-end reads</b></td>"); printf ("<td width=\"30%%\">%d</td>\n",currGE->numIntra1); printf ("<td width=\"30%%\">%d</td>\n",currGE->numIntra2); puts ("</tr>"); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Links</b></td>"); printf ("<td width=\"30%%\">[<a href=%s&hgt.customText=%s/BED/%s_1.bed target=blank>UCSC genome browser</a>] [<a href=%s/FASTA/%s_1.fasta>FASTA file</a>]<br></td>\n", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript1, currGE->startTranscript1 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript1 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), currGE->id, confp_get(Conf, "WEB_DATA_LINK"), currGE->id); printf ("<td width=\"30%%\">[<a href=%s&hgt.customText=%s/BED/%s_2.bed target=blank>UCSC genome browser</a>] [<a href=%s/FASTA/%s_2.fasta>FASTA file</a>]<br></td></tr>\n", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript2, currGE->startTranscript2 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript2 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), currGE->id, confp_get(Conf, "WEB_DATA_LINK"), currGE->id); puts ("<tr align=left>"); puts ("<td width=\"20%\"><b>Expression</b></td>"); stringPrintf(buffer, "%s/BGRS/%s_%s.bgr.gz", confp_get(Conf, "WEB_DATA_DIR"), argv[1], 
currGE->chromosomeTranscript1); ftmp = fopen( string(buffer), "r" ); // displaying this only if data are present puts("<td width=\"30%\">"); if( ftmp ) { printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target=blank>Expression %s</a>]", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript1, currGE->startTranscript1 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript1 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), argv[1], currGE->chromosomeTranscript1, currGE->chromosomeTranscript1); fclose(ftmp); } puts("</td>"); stringPrintf(buffer, "%s/BGRS/%s_%s.bgr.gz", confp_get(Conf, "WEB_DATA_DIR"),argv[1],currGE->chromosomeTranscript2); ftmp = fopen( string(buffer), "r" ); // displaying this only if data are present puts("<td width=\"30%\">"); if( ftmp ) { printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target=blank>Expression %s</a>]", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript2, currGE->startTranscript2 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript2 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), argv[1], currGE->chromosomeTranscript2, currGE->chromosomeTranscript2); fclose(ftmp); } puts("</td>"); puts("</tr>"); puts ("</table><br><br>"); puts ("<h2>Breakpoint analysis</h2><br>"); puts ("<table border=1 width=\"80%\" cellpadding=10><thead><tr><th>Orientation</th><th>Alignments</th><th colspan=2>Breakpoints</th></tr></thead><tbody>"); puts ("<tr><td>Orientation AB</td>"); if (currGE->strandTranscript1=='+') { currGE->strandTranscript2=='+' ? stringPrintf(buffer, "AB_trans1F_trans2F") : stringPrintf(buffer, "AB_trans1F_trans2R"); } else if( currGE->strandTranscript1 == '-') { currGE->strandTranscript2=='+' ? 
stringPrintf(buffer, "AB_trans1R_trans2F") : stringPrintf(buffer, "AB_trans1R_trans2R"); } else { die("Strand informatation is not correct (transcript 1): %c", currGE->strandTranscript1); } printf ("<td align=center><a href=%s/ALIGNMENTS/%s_AB_breakPointAlignments.txt><img src=%s/IMAGES/%s.png></img> AB</a></td>", confp_get(Conf, "WEB_DATA_LINK"), currGE->id, confp_get(Conf, "WEB_DATA_LINK"), string(buffer)); printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_AB_breakPointsTranscript1.wig target=blank>Breakpoints transcript 1 UCSC Genome Browser</a></td>", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript1, currGE->startTranscript1 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript1 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), currGE->id); printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_AB_breakPointsTranscript2.wig target=blank>Breakpoints transcript 2 UCSC Genome Browser</a></td></tr>", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript2, currGE->startTranscript2 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript2 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), currGE->id); fflush(stdout); puts ("<tr><td>Orientation BA</td>"); if (currGE->strandTranscript1 == '+') { currGE->strandTranscript2=='+' ? stringPrintf(buffer, "BA_trans1F_trans2F") : stringPrintf(buffer, "BA_trans1F_trans2R"); } else if( currGE->strandTranscript1 == '-') { currGE->strandTranscript2=='+' ? 
stringPrintf(buffer, "BA_trans1R_trans2F") : stringPrintf(buffer, "BA_trans1R_trans2R"); } else { die("Strand informatation is not correct (transcript2): %c", currGE->strandTranscript2); } printf ("<td align=center><a href=%s/ALIGNMENTS/%s_BA_breakPointAlignments.txt><img src=%s/IMAGES/%s.png></img> BA</a></td>", confp_get(Conf, "WEB_DATA_LINK"), currGE->id, confp_get(Conf, "WEB_DATA_LINK"), string(buffer)); printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_BA_breakPointsTranscript2.wig target=blank>Breakpoints transcript 2 UCSC Genome Browser</a></td>", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript2, currGE->startTranscript2 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript2 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), currGE->id); printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_BA_breakPointsTranscript1.wig target=blank>Breakpoints transcript 1 UCSC Genome Browser</a></td></tr>", htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", currGE->chromosomeTranscript1, currGE->startTranscript1 - atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION")), currGE->endTranscript1 + atoi(confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION"))), confp_get(Conf, "WEB_DATA_LINK"), currGE->id); puts ("</tbody></table>"); puts ("<br><br><br>"); fflush(stdout); puts ("<h2>Read coordinates</h2><br>"); puts ("<table border=0>"); puts ("<tr align=left>"); puts ("<th width=\"10%\">Pair Type</th>"); puts ("<th width=\"10%\">Entry Transcript 1</th>"); puts ("<th width=\"10%\">Read start transcript 1</th>"); puts ("<th width=\"10%\">Read end transcript 1</th>"); puts ("<th width=\"10%\">Entry Transcript 2</th>"); puts ("<th width=\"10%\">Read start transcript 2</th>"); puts ("<th width=\"10%\">Read end transcript 2</th>"); puts ("</tr>"); for (i = 0; i < arrayMax (currGE->interReads); i++) 
{ currGIR = arrp (currGE->interReads,i,GfrInterRead); printf ("<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%s</td><td>%d</td><td>%d</td></tr>\n", getPairTypeName(currGIR->pairType), getEntryNumber(currGIR->number1, currGIR->pairType, 1), currGIR->readStart1,currGIR->readEnd1, getEntryNumber(currGIR->number2,currGIR->pairType, 2), currGIR->readStart2, currGIR->readEnd2); } puts ("</table><br><br><br>"); puts ("</body>"); puts ("</html>"); fflush (stdout); } } confp_close(Conf); return EXIT_SUCCESS; }
int main (int argc, char *argv[]) { GfrEntry *currGE; BLEntry *currBLE; BLEntry currQuery; FILE *fp; char *line; int count; int countRemoved; int index; WordIter w; Array blackList = arrayCreate(20, BLEntry); config *Conf; if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) { die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH")); return EXIT_FAILURE; } if( confp_get( Conf, "ANNOTATION_DIR")==NULL ) { die("%s:\tCannot find ANNOTATION_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } if( confp_get( Conf, "BLACKLIST_FILENAME")==NULL ) { die("%s:\tCannot find BLACKLIST_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") ); return EXIT_FAILURE; } Stringa buffer=stringCreate( 100 ); stringPrintf( buffer, "%s/%s", confp_get( Conf, "ANNOTATION_DIR"), confp_get( Conf, "BLACKLIST_FILENAME") ); /* fp = fopen( string( buffer ), "r" ); if( !fp ) die("Unable to open file: %s", string(buffer)); stringDestroy( buffer ); */ // reading blacklist file LineStream ls = ls_createFromFile( string(buffer) ); while( line = ls_nextLine(ls) ) { w = wordIterCreate( line, "\t", 1); currBLE = arrayp( blackList, arrayMax(blackList), BLEntry); currBLE->gene1 = hlr_strdup ( wordNext(w) ); currBLE->gene2 = hlr_strdup ( wordNext(w) ); wordIterDestroy(w); } //fclose(fp); ls_destroy( ls ); stringDestroy( buffer ); arraySort( blackList, (ARRAYORDERF) sortBlackListByName1); // beginFiltering count = 0; countRemoved = 0; gfr_init ("-"); puts (gfr_writeHeader ()); while (currGE = gfr_nextEntry ()) { // reading the gfr if( currGE->geneSymbolTranscript1 == NULL ) { die("Gene symbols are not present in the GFR file. 
Please run gfrAddInfo before gfrBlackListFilter."); return EXIT_FAILURE; } // creating a new query to the black list currQuery.gene1 = currGE->geneSymbolTranscript1; currQuery.gene2 = currGE->geneSymbolTranscript2; if( strEqual( currQuery.gene1 , currQuery.gene2 ) ) { countRemoved++; continue; } // searching against read_1/read_2 int res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1); if( !res ) { // not found, then searching against read_2/read_1 currQuery.gene1 = currGE->geneSymbolTranscript2; currQuery.gene2 = currGE->geneSymbolTranscript1; res = arrayFind( blackList, &currQuery, &index, (ARRAYORDERF) sortBlackListByName1 ); if( !res ) { // not found, write the instance to stdout, update the counts puts (gfr_writeGfrEntry (currGE)); count++; } else { // found: read2/read1 countRemoved++; } } else { //found: read1/read2 countRemoved++; } } gfr_deInit (); arrayDestroy( blackList ); warn ("%s_BlackListFilter: %s",argv[0], confp_get( Conf, "BLACKLIST_FILENAME")); warn ("%s_numRemoved: %d",argv[0],countRemoved); warn ("%s_numGfrEntries: %d",argv[0],count); confp_close( Conf); return 0; }