/*
 * Description filter for GFR entries.
 *
 * Usage: <prog> <string>
 *
 * Reads GFR entries through the gfr_* reader opened on "-", writes the GFR
 * header to stdout and then every entry for which strCaseStr() finds argv[1]
 * in neither of the two transcript descriptions.  Entries with a match in
 * either description are only counted.  An entry lacking a description
 * aborts the run via die().  The search string and both counters are
 * reported through warn() at the end.
 */
int main (int argc, char *argv[])
{
    GfrEntry *entry;
    int numKept = 0;
    int numDropped = 0;

    if (argc != 2) {
        usage ("%s <string>",argv[0]);
    }

    gfr_init ("-");
    puts (gfr_writeHeader ());
    for (entry = gfr_nextEntry (); entry != NULL; entry = gfr_nextEntry ()) {
        /* Both descriptions are required before any matching is attempted. */
        if (!(entry->descriptionTranscript1 != NULL &&
              entry->descriptionTranscript2 != NULL)) {
            die ("Transcript description is missing");
        }
        if (strCaseStr (entry->descriptionTranscript1,argv[1]) ||
            strCaseStr (entry->descriptionTranscript2,argv[1])) {
            numDropped++;
        }
        else {
            puts (gfr_writeGfrEntry (entry));
            numKept++;
        }
    }
    gfr_deInit ();

    warn ("%s_string: %s",argv[0],argv[1]);
    warn ("%s_numRemoved: %d",argv[0],numDropped);
    warn ("%s_numGfrEntries: %d",argv[0],numKept);
    return 0;
}
Exemplo n.º 2
0
int main (int argc, char *argv[])
{
	GfrEntry *currGE;
	int count;
	int countRemoved;
	int i;

	if (argc != 3) {
		usage ("%s <offsetCutoff> <minNumUniqueReads>",argv[0]);
	}
	count = 0;
	countRemoved = 0;

	int offsetCutOff = atoi (argv[1]);
	int minNumUniqueReads = atoi (argv[2]);

	gfr_init ("-");
	puts (gfr_writeHeader ());
	while (currGE = gfr_nextEntry ()) {
		Array starts = arrayCreate( 100, int);
		for (i = 0; i < arrayMax( currGE->interReads ); i++) {
			int currStart = arrp(currGE->interReads, i, GfrInterRead)->readStart1 + arrp(currGE->interReads, i, GfrInterRead)->readStart2;
			array(starts, arrayMax(starts), int) = currStart; 
		}
		arraySort( starts, (ARRAYORDERF) arrayIntcmp );
		arrayUniq( starts, NULL, (ARRAYORDERF) arrayIntcmp ) ;
		int numUniqeOffsets = arrayMax( starts );
		arrayDestroy( starts );

	if (arrayMax( currGE->readsTranscript1 ) != arrayMax( currGE->readsTranscript2 ) )
		die( "The two ends have a different number of reads");
	Texta reads = textCreate(arrayMax(currGE->readsTranscript1));
	for (i = 0; i < arrayMax(currGE->readsTranscript1); i++) {
		Stringa strA = stringCreate( strlen(textItem( currGE->readsTranscript1, i) ) * 2 + 1);
		stringAppendf( strA, textItem( currGE->readsTranscript1,i));
		stringAppendf( strA, textItem( currGE->readsTranscript2,i)); 
		textAdd( reads, string(strA));
		stringDestroy( strA );
	}
	textUniqKeepOrder( reads );
	int numRemaining = arrayMax( reads );
	textDestroy ( reads );

	if (numRemaining <= minNumUniqueReads || numUniqeOffsets <= offsetCutOff) {
		countRemoved++;
		continue;
	} 
	puts (gfr_writeGfrEntry (currGE));
	count++;
	}
	gfr_deInit ();
	warn("%s_PCRFilter: offset=%d minNumUniqueReads=%d",
	     argv[0],offsetCutOff, minNumUniqueReads);
	warn("%s_numRemoved: %d",argv[0],countRemoved);
	warn("%s_numGfrEntries: %d",argv[0],count);
	return 0;
}
Exemplo n.º 3
0
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  int count;
  int countRemoved; 
  int i, j;
  int foundEST;
 
  if (argc != 2) {
    usage ("%s <EST.interval>",argv[0]);
  }  
  intervalFind_addIntervalsToSearchSpace( argv[1], 0);	

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    foundEST = 0;
    if( strEqual( currGE->fusionType, "cis" ) ) {
      if( ! strEqual( currGE->chromosomeTranscript1, currGE->chromosomeTranscript2 ) )
	die("The two genes are not on the same chromosomes: %s - %s",  currGE->chromosomeTranscript1, currGE->chromosomeTranscript2 );
      int start1, end1, start2, end2;
      findCoordinates( currGE, &start1, &end1, &start2, &end2 );
      
      Array intervals1 = arrayCopy( intervalFind_getOverlappingIntervals( currGE->chromosomeTranscript1, start1, end1 ) ); 
      Array intervals2 = intervalFind_getOverlappingIntervals( currGE->chromosomeTranscript2, start2, end2 );
      for( i=0; i<arrayMax( intervals1 ); i++ ) {
	Interval* currInterval1 = arru( intervals1, i, Interval* );
	for( j=0; j<arrayMax ( intervals2 ); j++ ) {
	  Interval* currInterval2 = arru( intervals2, j, Interval* );
	  if( currInterval1==currInterval2 ) {
	    foundEST = 1;
	    i = arrayMax( intervals1 );
	    j = arrayMax( intervals2 );
	  }
	}
      }
      arrayDestroy( intervals1 );
      
    }
    if( foundEST )
      countRemoved++;
    else {
      puts (gfr_writeGfrEntry (currGE));
      count++;
    }
  }	           
  gfr_deInit ();
  warn ("%s_EST_data: %s",argv[0], argv[1]);
  warn ("%s_numRemoved: %d",argv[0], countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  return 0;
}
Exemplo n.º 4
0
/*
 * Adds the pair-count column to a GFR stream.
 *
 * Registers GFR_COLUMN_NAME_PAIR_COUNT as an additional column, writes the
 * resulting header, then calls obtainPairCounts() on every entry and writes
 * it to stdout, flushing after each entry.  The number of entries processed
 * is reported through warn().
 */
int main (int argc, char *argv[])
{
	GfrEntry *entry;
	int numEntries = 0;

	gfr_init ("-");
	gfr_addNewColumnType (GFR_COLUMN_NAME_PAIR_COUNT);
	puts (gfr_writeHeader ());

	for (entry = gfr_nextEntry (); entry != NULL; entry = gfr_nextEntry ()) {
		obtainPairCounts (entry);
		puts (gfr_writeGfrEntry (entry));
		/* Flush per entry so output becomes visible as it is produced. */
		fflush (stdout);
		numEntries++;
	}

	gfr_deInit ();
	warn ("%s_numGfrEntries: %d",argv[0],numEntries);
	return 0;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  Array kgTreeFams;
  Stringa buffer;
  int count;
  int countRemoved;

  config *conf;

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
    return EXIT_FAILURE;

  buffer = stringCreate (100);
  stringPrintf (buffer,"%s/%s",
                confp_get(conf, "ANNOTATION_DIR"), 
		confp_get(conf, "KNOWN_GENE_TREE_FAM_FILENAME"));
  kgTreeFams = util_readKnownGeneTreeFams (string (buffer));
  arraySort (kgTreeFams,(ARRAYORDERF)sortKgTreeFamsByTranscriptName);
  stringDestroy (buffer);

  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()){
    if (isHomologous (kgTreeFams,currGE->nameTranscript1,currGE->nameTranscript2)) {
      countRemoved++;
      continue;
    }
    puts (gfr_writeGfrEntry (currGE));
    count++;
  }
  gfr_deInit ();
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);

  confp_close(conf);

  return EXIT_SUCCESS;
}
Exemplo n.º 6
0
int main (int argc, char *argv[])
{
	GfrEntry *currGE;
	GfrInterRead *currGIR;
	int i;
	Stringa buffer;
	FILE *fp1,*fp2;
	int count;

	count = 0;
	buffer = stringCreate (100);
	gfr_init ("-");
	puts (gfr_writeHeader ());
	while (currGE = gfr_nextEntry ()) {
		stringPrintf (buffer,"%s_1.bed",currGE->id);
		fp1 = fopen (string (buffer),"w");
		stringPrintf (buffer,"%s_2.bed",currGE->id);
		fp2 = fopen (string (buffer),"w");
		if (fp1 == NULL || fp2 == NULL) {
			die ("Unable to open BED files");
		}
		fprintf (fp1,"browser full knownGene\n");
		fprintf (fp1,"track name=\"Inter paird-ends: %s_1\" visibility=2\n",currGE->id);
		fprintf (fp2,"browser full knownGene\n");
		fprintf (fp2,"track name=\"Inter paird-ends: %s_2\" visibility=2\n",currGE->id);
		for (i = 0; i < arrayMax (currGE->interReads); i++) {
			currGIR = arrp (currGE->interReads,i,GfrInterRead);
			fprintf (fp1,"%s\t%d\t%d\n",currGE->chromosomeTranscript1,currGIR->readStart1,currGIR->readEnd1);
			fprintf (fp2,"%s\t%d\t%d\n",currGE->chromosomeTranscript2,currGIR->readStart2,currGIR->readEnd2);
		}
		fclose (fp1);
		fclose (fp2);
		puts (gfr_writeGfrEntry (currGE));
		count++;
	}
	gfr_deInit ();
	stringDestroy (buffer);
	warn ("%s_numGfrEntries: %d",argv[0],count);
	return 0;
}
Exemplo n.º 7
0
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  int count;
  int countRemoved;
  int mitochondrialCount; 
  unsigned int minReadSize;
  int  i;
  Stringa cmd;
  BlatQuery *blQ=NULL;
  config *conf = NULL; /**< Pointer to configuration file .fusionseqrc  */

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH"));
    return EXIT_FAILURE;
  }
  if( confp_get( conf,"MAX_OVERLAP_ALLOWED")==NULL ) {
    die("%s:\tCannot find MAX_OVERLAP_ALLOWED in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf,"MAX_FRACTION_HOMOLOGOUS")==NULL ) {
    die("%s:\tCannot find MAX_FRACTION_HOMOLOGOUS in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "MITOCHONDRIAL_DIR")==NULL ) {
    die("%s:\tCannot find MITOCHONDRIAL_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf, "MITOCHONDRIAL_FILENAME")==NULL ) {
    die("%s:\tCannot find MITOCHONDRIAL_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "TMP_DIR")==NULL ) {
    die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFSERVER")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) {
    die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }

  count = 0;
  countRemoved = 0;
  
  cmd = stringCreate (100);
  // initializing the gfServers
  stringPrintf( cmd, "%s status %s %d &> /dev/null", confp_get( conf, "BLAT_GFSERVER"),  confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2);
  int ret = hlr_system( string(cmd), 1 );
   if( ret != 0 ) { // not initialized
    stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_mitochondrial.log start %s %d %s/%s  &", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "TMP_DIR"),  confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2, confp_get( conf, "MITOCHONDRIAL_DIR"), confp_get( conf,"MITOCHONDRIAL_FILENAME"));
    hlr_system( string( cmd ), 0 );
    long int startTime = time(0);
    stringPrintf( cmd , "%s status %s %d &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2);
    while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ;
    if( hlr_system( string(cmd), 1 ) != 0 )  {
      die("gfServer for %s/%s not initialized: %s %s %s", confp_get( conf, "MITOCHONDRIAL_DIR"), confp_get( conf, "MITOCHONDRIAL_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); 
      return EXIT_FAILURE;
    }
  } 

 
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) {
    if (strEqual(currGE->chromosomeTranscript1, "chrM") || 
	strEqual(currGE->chromosomeTranscript2, "chrM")) {
      countRemoved++;
      continue;
    } else {
      mitochondrialCount = 0;
      minReadSize=1000;
      writeFasta( currGE, &minReadSize, confp_get( conf, "TMP_DIR") ); // in util.c
      stringPrintf(cmd, "cd %s;%s %s %d / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.mito.psl &>/dev/null", confp_get( conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2, minReadSize - 5 > 20 ? minReadSize - 5 : 20 , currGE->id, currGE->id);
      int attempts=0;
      ret = hlr_system( string(cmd), 1 );
      while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++;
      if( attempts == 5000 ) {
	die("Cannot map the reads %s", string( cmd ));
	return EXIT_FAILURE;
      }

      // reading the results of blast from File
      stringPrintf(cmd,  "%s/%s.mito.psl", confp_get( conf, "TMP_DIR"), currGE->id);
      blatParser_initFromFile( string(cmd) );
      while( blQ = blatParser_nextQuery() ) {
	//warn("iter %d\tquery %s", iter, blQ->qName );iter++; 
	int nucleotideOverlap = getNucleotideOverlap ( blQ );
	if (nucleotideOverlap > (((double) minReadSize) * strtod(confp_get( conf, "MAX_OVERLAP_ALLOWED"), NULL))) {
	  char* value = strchr( blQ->qName,'/' );
	  if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName );
	  int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry
	  GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead );
	  currGIR->flag = 1;
	  mitochondrialCount++;
	} 
      }
      blatParser_deInit();
      if ( ( (double) mitochondrialCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) ) <= strtod(confp_get( conf, "MAX_FRACTION_HOMOLOGOUS"), NULL)) {   
	if( mitochondrialCount > 0 ) updateStats( currGE );
	// writing the gfrEntry
	puts (gfr_writeGfrEntry (currGE));
	count++;
      } else {
	countRemoved++;
      }
      // removing temporary files
      stringPrintf (cmd,"rm -rf %s/%s_reads.fa %s/%s.mito.psl", confp_get( conf, "TMP_DIR"),  currGE->id, confp_get( conf, "TMP_DIR"),  currGE->id );
      hlr_system( string(cmd) , 1);      
    } 
    
  }
  gfr_deInit ();
 
  stringDestroy( cmd );
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  confp_close(conf);
  return 0;
}
Exemplo n.º 8
0
/**
 * Print the HTML results page for one data set to stdout.
 *
 * The page consists of a row of per-chromosome expression links (one per
 * file matching <WEB_DATA_DIR>/BGRS/<prefix>_chr*.bgr.gz) followed by a
 * table with one row per entry of <WEB_DATA_DIR>/<prefix>.gfr.
 *
 * @param prefix        Data set name; must not be empty (die() otherwise).
 * @param typeSelected  "all", a fusion type to match exactly, or "same",
 *                      which selects the "cis" and "read-through" types.
 * @param minNum        Entries with numInter below this value are skipped.
 *
 * NOTE(review): prefix is inserted unescaped into a shell command and into
 * HTML.  If it can come from a request parameter it needs validation by the
 * caller -- confirm.
 */
static void generateOutput (char* prefix, char* typeSelected, int minNum)
{
  GfrEntry *currGE;
  Stringa buffer;
  char *pos;
  char *flankSetting;
  int flank;

  puts ("<html>");
  puts ("<head>");
  puts ("<title>Results - Gene Fusions</title>");
  html_printGenericStyleSheet (12);
  puts ("</head>");
  puts ("<body>");
  if (prefix[0] == '\0') {
    die ("Invalid prefix");
  }
  printf ("<h1>Results - %s</h1><br><br><br>",prefix);

  /* The flanking region is a configuration string; it has to be converted
     to a number before it can take part in coordinate arithmetic. */
  flankSetting = confp_get (Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION");
  flank = (flankSetting != NULL) ? atoi (flankSetting) : 0;

  buffer = stringCreate (50);
  //Chromosome expression, if present
  LineStream ls;
  char *chrSignal = NULL;
  stringPrintf (buffer, "ls -1 %s/BGRS/%s_chr*.bgr.gz 2> /dev/null",
                confp_get (Conf, "WEB_DATA_DIR"),
                prefix);
  ls = ls_createFromPipe (string (buffer));
  int countCol = 0;
  puts ("Expression signal: &nbsp;");
  fflush (stdout);
  while ((chrSignal = ls_nextLine (ls)) != NULL) {
    /* Text between the prefix and ".bgr.gz", i.e. "_chrN". */
    char *chrTmp = stringBetween (prefix, ".bgr.gz", chrSignal);
    /* Guard before stepping over the leading '_'; what stringBetween
       returns on a failed match is not visible here -- TODO confirm. */
    if (chrTmp == NULL || chrTmp[0] == '\0') {
      continue;
    }
    chrTmp++;
    printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target='blank'>%s</a>]&nbsp;",
            htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human", chrTmp,
                                                          flank,
                                                          50000000 + flank),
            confp_get (Conf, "WEB_DATA_LINK"),
            prefix,
            chrTmp,
            chrTmp);
    if (countCol > 10) {
      puts ("<BR>");
      countCol = 0;
    }
    countCol++;
  }
  if (countCol == 0) {
    puts ("No data available yet");
  }
  ls_destroy (ls);
  puts ("<br><br>");
  puts ("For a definition of SPER, DASPER and RESPER see <a href=http://rnaseq.gersteinlab.org/fusionseq/>FusionSeq</a>");
  puts ("<br><br>");
  puts ("<br><table border=0 width=100% align=center cellpadding=10>");
  puts ("<tr align=left>");
  puts ("<th>SPER</th>");
  puts ("<th>DASPER</th>");
  puts ("<th>RESPER</th>");
  puts ("<th>Number of inter paired-end reads</th>");
  puts ("<th>Type</th>");
  puts ("<th>Genomic coordinates</th>");
  puts ("<th>Gene symbol</th>");
  puts ("<th>Description</th>");
  puts ("<th>Genomic coordinates</th>");
  puts ("<th>Gene symbol</th>");
  puts ("<th>Description</th>");
  puts ("<th></th>");
  puts ("</tr>");
  fflush (stdout);

  stringPrintf (buffer,"%s/%s.gfr", confp_get (Conf, "WEB_DATA_DIR"), prefix);
  gfr_init (string (buffer));
  int countElements = 0;
  while ((currGE = gfr_nextEntry ()) != NULL) {
    if (currGE->numInter < minNum) {
      continue;
    }
    if (strEqual (typeSelected,"all") || strEqual (currGE->fusionType,typeSelected) ||
        (strEqual (currGE->fusionType,"cis") && strEqual (typeSelected,"same")) ||
        (strEqual (currGE->fusionType,"read-through") && strEqual (typeSelected,"same"))) {
      /* Only the part of each description before the first '|' is shown. */
      if (currGE->descriptionTranscript1 != NULL &&
          (pos = strchr (currGE->descriptionTranscript1,'|')) != NULL) {
        *pos = '\0';
      }
      if (currGE->descriptionTranscript2 != NULL &&
          (pos = strchr (currGE->descriptionTranscript2,'|')) != NULL) {
        *pos = '\0';
      }
      puts ("<tr>");
      printf ("<td align=left>%1.3f</td>\n",currGE->SPER);
      printf ("<td align=left>%1.3f</td>\n",currGE->DASPER);
      printf ("<td align=left>%1.3f</td>\n",currGE->RESPER);
      printf ("<td align=left>%d</td>\n",currGE->numInter);
      printf ("<td align=left>%s</td>\n",currGE->fusionType);
      printf ("<td align=left><a href=%s target=blank>%s:%d-%d</a></td>\n",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              currGE->chromosomeTranscript1,currGE->startTranscript1,currGE->endTranscript1);
      printf ("<td align=left>%s</td>\n",processString (currGE->geneSymbolTranscript1));
      printf ("<td align=left>%s</td>\n",currGE->descriptionTranscript1);
      printf ("<td align=left><a href=%s target=blank>%s:%d-%d</a></td>\n",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              currGE->chromosomeTranscript2,currGE->startTranscript2,currGE->endTranscript2);
      printf ("<td align=left>%s</td>\n",processString (currGE->geneSymbolTranscript2));
      printf ("<td align=left>%s</td>\n",currGE->descriptionTranscript2);
      printf ("<td align=left><a href=%s/showDetails_cgi?%s+%s>Details</a></td>\n", confp_get (Conf, "WEB_URL_CGI"), prefix,currGE->id);
      puts ("</tr>");
      countElements++;
    }
  }
  gfr_deInit ();
  stringDestroy (buffer);
  puts ("</table><br><br>");
  if (countElements == 0) {
    puts ("No fusion candidates can be found satisfying all specified criteria.");
  }
  puts ("</body>");
  puts ("</html>");
  fflush (stdout);
}
Exemplo n.º 9
0
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);

  if (argc != 2) {
    usage ("%s <blackList.txt>",argv[0]);
  }  
  fp = fopen( argv[1], "r" );
  
  if( !fp )  die("Unable to open file: %s", argv[1]);
  // reading blacklist file
  LineStream ls = ls_createFromFile( argv[1] );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  fclose(fp);
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], argv[1]);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  return 0;
}
Exemplo n.º 10
0
/*
 * CGI program printing the detail page of one gene fusion candidate.
 *
 * Expects two arguments: argv[1] is the data set prefix, argv[2] the id of
 * the entry to show.  The entry is looked up in
 * <WEB_DATA_DIR>/<argv[1]>.gfr and, when found, rendered as HTML on stdout:
 * summary, connectivity table, transcript information, breakpoint links
 * and read coordinates.  With any other argument count only the CGI header
 * is emitted.
 *
 * Returns EXIT_FAILURE when the configuration named by FUSIONSEQ_CONFPATH
 * cannot be opened, EXIT_SUCCESS otherwise.
 *
 * NOTE(review): argv[1] and argv[2] are inserted unescaped into file names
 * and HTML; if they originate from the request they need validation --
 * confirm.
 */
int main (int argc, char *argv[])
{
  FILE* ftmp = NULL;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
    return EXIT_FAILURE;

  cgiInit();
  cgiHeader("text/html");

  if (argc == 3) {
    GfrEntry *currGE;
    Stringa buffer;
    GfrPairCount *currGEPC;
    GfrInterRead *currGIR;
    int i;
    char *flankSetting;
    int flank;

    /* Number of bases added on both sides of a region in browser links. */
    flankSetting = confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION");
    flank = (flankSetting != NULL) ? atoi(flankSetting) : 0;

    puts ("<html>");
    puts ("<head>");
    html_printGenericStyleSheet (12);
    puts ("<title>geneFusions Details</title>\n");
    puts ("</head>");
    puts ("<body>");
    buffer = stringCreate (100);
    stringPrintf (buffer, "%s/%s.gfr", confp_get(Conf, "WEB_DATA_DIR"),argv[1]);
    gfr_init (string (buffer));
    while ((currGE = gfr_nextEntry ()) != NULL) {
      fflush( stdout );
      if (!strEqual (currGE->id,argv[2])) {
        continue;
      }
      printf ("<h1>Detailed summary for potential gene fusion candidate</h1><br>");
      puts ("<table border=0 cellpadding=10>");
      puts ("<tr align=left valign=top>");
      puts ("<td width=400>");
      puts ("<h2>Summary information</h2><br>");
      printf ("<b>Identifier</b>: %s<br><br>\n",currGE->id);
      printf ("<b>Number of inter paired-end reads</b>: %d<br><br>\n",currGE->numInter);
      printf ("<b>Type</b>: %s<br><br>\n",currGE->fusionType);

      stringPrintf(buffer, "%s/GFF/%s.gff", confp_get(Conf, "WEB_DATA_DIR"),currGE->id);
      ftmp = fopen( string(buffer), "r" ); // displaying this only if data are present
      if (ftmp) {
        /* The link spans from the start of transcript 1 to the end of
           transcript 2. */
        printf("<b>Connected Reads</b>: <a href=%s&hgt.customText=%s/GFF/%s.gff target=blank>UCSC connectivity graph</a><br>\n",
               htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                             currGE->chromosomeTranscript1,
                                                             currGE->startTranscript1 - flank,
                                                             currGE->endTranscript2 + flank),
               confp_get(Conf, "WEB_DATA_LINK"),currGE->id);
        fclose( ftmp );
      }

      puts ("</td>");
      puts ("<td>");
      puts ("<h2>Transcript connectivity graph</h2>");
      printf ("<img src=%s/IMAGES/%s.jpg alt=geneFusionImage>\n", confp_get(Conf, "WEB_DATA_LINK"), currGE->id);
      puts ("</td>");
      puts ("<td>");
      puts ("<h2>Transcript connectivity table</h2><br>");
      puts ("<table border=0>");
      puts ("<tr align=left>");
      puts ("<th width=200>Pair Type</th>");
      puts ("<th width=200>Entry transcript 1</th>");
      puts ("<th width=200>Entry transcript 2</th>");
      puts ("<th width=200>Counts</th>");
      puts ("</tr>");
      fflush( stdout );
      for (i = 0; i < arrayMax (currGE->pairCounts); i++) {
        currGEPC = arrp (currGE->pairCounts,i,GfrPairCount);
        printf ("<tr><td>%s</td><td>%s</td><td>%s</td><td>%.2f</td></tr>\n",
                getPairTypeName(currGEPC->pairType),
                getEntryNumber(currGEPC->number1, currGEPC->pairType, 1),
                getEntryNumber(currGEPC->number2, currGEPC->pairType, 2),
                currGEPC->count);
      }
      puts ("</table>");
      puts ("</td>");
      puts ("</tr>");
      puts ("</table>");
      puts ("<br>");

      puts ("<h2>Transcript information</h2><br>");
      puts ("<table border=1 cellpadding=10 width=\"80%\">");
      puts ("<tr align=left>");
      puts ("<th width=\"20%\"></th>");
      puts ("<th><font color='blue'>Transcript 1</font></th>");
      puts ("<th><font color='orange'>Transcript 2</font></th>");
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Gene symbol(s)</b></td>");
      printf ("<td width=\"30%%\"><font color='blue'>%s</font></td>\n",processString (currGE->geneSymbolTranscript1));
      printf ("<td width=\"30%%\"><font color='orange'>%s</font></td>\n",processString (currGE->geneSymbolTranscript2));
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Coordinates</b></td>");
      printf ("<td width=\"30%%\">%s:%d-%d</td>\n",currGE->chromosomeTranscript1,currGE->startTranscript1,currGE->endTranscript1);
      printf ("<td width=\"30%%\">%s:%d-%d</td>\n",currGE->chromosomeTranscript2,currGE->startTranscript2,currGE->endTranscript2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Strand</b></td>");
      printf ("<td width=\"30%%\">%c</td>\n",currGE->strandTranscript1);
      printf ("<td width=\"30%%\">%c</td>\n",currGE->strandTranscript2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Gene description(s)</b></td>");
      printf ("<td width=\"30%%\">%s</td>\n",processString (currGE->descriptionTranscript1));
      printf ("<td width=\"30%%\">%s</td>\n",processString (currGE->descriptionTranscript2));
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Number of exons</b></td>");
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numExonsTranscript1);
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numExonsTranscript2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Number of intra paired-end reads</b></td>");
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numIntra1);
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numIntra2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Links</b></td>");
      printf ("<td width=\"30%%\">[<a href=%s&hgt.customText=%s/BED/%s_1.bed target=blank>UCSC genome browser</a>]&nbsp;&nbsp;&nbsp;[<a href=%s/FASTA/%s_1.fasta>FASTA file</a>]<br></td>\n",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      printf ("<td width=\"30%%\">[<a href=%s&hgt.customText=%s/BED/%s_2.bed target=blank>UCSC genome browser</a>]&nbsp;&nbsp;&nbsp;[<a href=%s/FASTA/%s_2.fasta>FASTA file</a>]<br></td></tr>\n",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);

      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Expression</b></td>");

      stringPrintf(buffer, "%s/BGRS/%s_%s.bgr.gz",
                   confp_get(Conf, "WEB_DATA_DIR"),
                   argv[1],
                   currGE->chromosomeTranscript1);
      ftmp = fopen( string(buffer), "r" ); // displaying this only if data are present
      puts("<td width=\"30%\">");
      if( ftmp ) {
        printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target=blank>Expression %s</a>]",
                htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                              currGE->chromosomeTranscript1,
                                                              currGE->startTranscript1 - flank,
                                                              currGE->endTranscript1 + flank),
                confp_get(Conf, "WEB_DATA_LINK"),
                argv[1],
                currGE->chromosomeTranscript1,
                currGE->chromosomeTranscript1);
        fclose(ftmp);
      }
      puts("</td>");

      stringPrintf(buffer, "%s/BGRS/%s_%s.bgr.gz", confp_get(Conf, "WEB_DATA_DIR"),argv[1],currGE->chromosomeTranscript2);
      ftmp = fopen( string(buffer), "r" ); // displaying this only if data are present
      puts("<td width=\"30%\">");
      if( ftmp ) {
        printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target=blank>Expression %s</a>]",
                htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                              currGE->chromosomeTranscript2,
                                                              currGE->startTranscript2 - flank,
                                                              currGE->endTranscript2 + flank),
                confp_get(Conf, "WEB_DATA_LINK"),
                argv[1],
                currGE->chromosomeTranscript2,
                currGE->chromosomeTranscript2);
        fclose(ftmp);
      }
      puts("</td>");
      puts("</tr>");
      puts ("</table><br><br>");

      puts ("<h2>Breakpoint analysis</h2><br>");
      puts ("<table border=1 width=\"80%\" cellpadding=10><thead><tr><th>Orientation</th><th>Alignments</th><th colspan=2>Breakpoints</th></tr></thead><tbody>");
      puts ("<tr><td>Orientation AB</td>");

      /* Image names encode the strands: 'F' for '+', 'R' for '-'.  The
         strand of transcript 1 must be '+' or '-'; for transcript 2 any
         value other than '+' is treated as reverse.  The codes are
         determined once and used for both the AB and the BA image. */
      char orient1 = 'F';
      char orient2 = (currGE->strandTranscript2 == '+') ? 'F' : 'R';
      if (currGE->strandTranscript1 == '+') {
        orient1 = 'F';
      } else if (currGE->strandTranscript1 == '-') {
        orient1 = 'R';
      } else {
        die("Strand informatation is not correct (transcript 1): %c", currGE->strandTranscript1);
      }

      stringPrintf(buffer, "AB_trans1%c_trans2%c", orient1, orient2);
      printf ("<td align=center><a href=%s/ALIGNMENTS/%s_AB_breakPointAlignments.txt><img src=%s/IMAGES/%s.png></img>&nbsp;AB</a></td>",
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              string(buffer));
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_AB_breakPointsTranscript1.wig target=blank>Breakpoints transcript 1 UCSC Genome Browser</a></td>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_AB_breakPointsTranscript2.wig target=blank>Breakpoints transcript 2 UCSC Genome Browser</a></td></tr>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      fflush(stdout);

      puts   ("<tr><td>Orientation BA</td>");
      stringPrintf(buffer, "BA_trans1%c_trans2%c", orient1, orient2);
      printf ("<td align=center><a href=%s/ALIGNMENTS/%s_BA_breakPointAlignments.txt><img src=%s/IMAGES/%s.png></img>&nbsp;BA</a></td>",
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              string(buffer));
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_BA_breakPointsTranscript2.wig target=blank>Breakpoints transcript 2 UCSC Genome Browser</a></td>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_BA_breakPointsTranscript1.wig target=blank>Breakpoints transcript 1 UCSC Genome Browser</a></td></tr>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);

      puts ("</tbody></table>");
      puts ("<br><br><br>");
      fflush(stdout);

      puts ("<h2>Read coordinates</h2><br>");
      puts ("<table border=0>");
      puts ("<tr align=left>");
      puts ("<th width=\"10%\">Pair Type</th>");
      puts ("<th width=\"10%\">Entry Transcript 1</th>");
      puts ("<th width=\"10%\">Read start transcript 1</th>");
      puts ("<th width=\"10%\">Read end transcript 1</th>");
      puts ("<th width=\"10%\">Entry Transcript 2</th>");
      puts ("<th width=\"10%\">Read start transcript 2</th>");
      puts ("<th width=\"10%\">Read end transcript 2</th>");
      puts ("</tr>");
      for (i = 0; i < arrayMax (currGE->interReads); i++) {
        currGIR = arrp (currGE->interReads,i,GfrInterRead);
        printf ("<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%s</td><td>%d</td><td>%d</td></tr>\n",
                getPairTypeName(currGIR->pairType),
                getEntryNumber(currGIR->number1, currGIR->pairType, 1),
                currGIR->readStart1,currGIR->readEnd1,
                getEntryNumber(currGIR->number2,currGIR->pairType, 2),
                currGIR->readStart2,
                currGIR->readEnd2);
      }
      puts ("</table><br><br><br>");
    }
    /* Release the reader and the buffer, and close the document even when
       no entry matched, so the page is always well-formed. */
    gfr_deInit ();
    stringDestroy (buffer);
    puts ("</body>");
    puts ("</html>");
    fflush (stdout);
  }
  confp_close(Conf);

  return EXIT_SUCCESS;
}
Exemplo n.º 11
0
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);
  config *Conf;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH"));
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "ANNOTATION_DIR")==NULL ) {
    die("%s:\tCannot find ANNOTATION_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "BLACKLIST_FILENAME")==NULL ) {
    die("%s:\tCannot find BLACKLIST_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  Stringa buffer=stringCreate( 100 );
  stringPrintf( buffer, "%s/%s", confp_get( Conf, "ANNOTATION_DIR"), confp_get( Conf, "BLACKLIST_FILENAME") );
  /*  fp = fopen( string( buffer ), "r" );
  if( !fp )  die("Unable to open file: %s", string(buffer));
  stringDestroy( buffer );
  */ 
// reading blacklist file
  LineStream ls = ls_createFromFile( string(buffer) );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  //fclose(fp);
  ls_destroy( ls );
  stringDestroy( buffer );
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    if( currGE->geneSymbolTranscript1 == NULL ) {
      die("Gene symbols are not present in the GFR file. Please run gfrAddInfo before gfrBlackListFilter.");
      return EXIT_FAILURE;
    }
	
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    if( strEqual( currQuery.gene1 , currQuery.gene2 ) ) {
	countRemoved++;
	continue;
      }
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], confp_get( Conf, "BLACKLIST_FILENAME"));
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  confp_close( Conf);
  return 0;
}