Exemple #1
0
static char* getBreakPointSequence (char *tileCoordinate1, char *tileCoordinate2)
{
	Stringa buffer;
	Stringa targetsFile;
	FILE *fp;
	Array targetSeqs;
	int i;
	Seq *currSeq;
	static Stringa sequence = NULL;

	buffer = stringCreate (100);
	targetsFile = stringCreate (100);
	stringPrintf (targetsFile,"targets_%d.txt",getpid ());
	if (!(fp = fopen (string (targetsFile),"w")) ){
		die ("Unable to open target file: %s",string (targetsFile));
	}
	fprintf (fp,"%s\n%s",tileCoordinate1,tileCoordinate2);
	fclose (fp);

	stringPrintf (buffer,"%s %s/%s stdout -noMask -seqList=%s",
		      confp_get(Conf, "BLAT_TWO_BIT_TO_FA"),
		      confp_get(Conf, "BLAT_DATA_DIR"),
		      confp_get(Conf, "BLAT_TWO_BIT_DATA_FILENAME"),
		      string (targetsFile));
	fasta_initFromPipe (string (buffer));
	targetSeqs = fasta_readAllSequences (0);
	fasta_deInit ();
	if (arrayMax (targetSeqs) != 2) {
		die ("Expected only two target sequences");
	} 
	stringCreateClear (sequence,100);
	for (i = 0; i < arrayMax (targetSeqs); i++) {
		currSeq = arrp (targetSeqs,i,Seq);
		stringAppendf (sequence,"%s",currSeq->sequence);
		hlr_free (currSeq->name);
		hlr_free (currSeq->sequence);
	}
	arrayDestroy (targetSeqs);
	stringPrintf (buffer,"rm -rf %s",string (targetsFile));
	hlr_system (string (buffer),0);
	stringDestroy (targetsFile);
	stringDestroy (buffer);
	return string (sequence);
}
static char* lookUpTreeFam (Array kgTreeFams, char *transcript) 
{
  KgTreeFam testKGTF;
  int index;
  int foundIt;
   
  foundIt = 0;
  testKGTF.transcriptName = hlr_strdup (transcript);
  foundIt = arrayFind (kgTreeFams,&testKGTF,&index,(ARRAYORDERF)sortKgTreeFamsByTranscriptName);
  hlr_free (testKGTF.transcriptName);
  if (foundIt) {
    return  arrp (kgTreeFams,index,KgTreeFam)->treeFamId;
  }
  return NULL;
}
int main (int argc, char *argv[])
{
	Array kgXrefs;
	Stringa buffer;
	LineStream ls;
	int count=0;
	char* geneSymbolTranscript;
	char* descriptionTranscript;
	char* line;
	char* exonID = NULL;

	config *conf;

	if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
		return EXIT_FAILURE;

	buffer = stringCreate (100);

	stringPrintf (buffer,"%s/%s",
		      confp_get(conf, "ANNOTATION_DIR"),
		      confp_get(conf, "KNOWN_GENE_XREF_FILENAME"));
	kgXrefs = util_readKnownGeneXrefs (string (buffer));
	arraySort (kgXrefs,(ARRAYORDERF)sortKgXrefsByTranscriptName);
	stringDestroy (buffer);

	//  gfr_init ("-");
	 ls = ls_createFromFile("-");
  
	while (line = ls_nextLine(ls)) {
		char *lineP = hlr_strdup(line);
		WordIter w = wordIterCreate( line, "\t", 0);
		char *nameTranscript = wordNext( w );
		char *p = rindex(nameTranscript, '_');
		if (p) {
			exonID = hlr_strdup( p+1 );
			*p='\0';
		}
		transcript2geneSymbolAndGeneDescription(kgXrefs,
							nameTranscript,
							&geneSymbolTranscript,
							&descriptionTranscript);
		if (exonID) {
			printf("%s_%s\t%s\t%s\t%s", 
				nameTranscript, 
				exonID,
				geneSymbolTranscript, 
				exonID, 
				descriptionTranscript);
			hlr_free(exonID);
		} else {
			printf("%s\t%s\t1\t%s", 
				nameTranscript, 
			
	geneSymbolTranscript, 
				descriptionTranscript);
		}
		printf("%s\n", lineP+strlen(nameTranscript));
		count++;
		hlr_free(lineP);
		wordIterDestroy(w);
	}
	ls_destroy (ls);
	warn ("%s_numGfrEntries: %d",argv[0],count);
	confp_close(conf);

	return EXIT_SUCCESS;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  int i,j,k,l, h,index;
  Stringa buffer,cmd,fnSequencesToAlign;
  FILE *fp;
  FILE *fp1;
  FILE *fp2;
  FILE *freads1;
  FILE *freads2;
  Array gfrEntries;
  BowtieQuery *currBQ,testBQ;
  BowtieEntry *currBE;
  Texta seqNames;
  int readSize1, readSize2, minReadSize;
  Array bowtieQueries;
  char transcriptNumber;
  int isHomologous,homologousCount;
  int count;
  int countRemoved;
  unsigned short int tooMany;
  BlatQuery *blQ;

  config *conf;

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc", argv[0]);
    return EXIT_FAILURE;
  } 
  if ( (confp_get( conf, "BLAT_TWO_BIT_TO_FA")) == NULL) {
    die("%s:\tCannot find BLAT_TWO_BIT_TO_FA in the configuration file: %s", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  } 
  if ( (confp_get( conf,"BLAT_DATA_DIR")) == NULL) {
    die("%s:\tCannot find BLAT_DATA_DIR in the configuration file: %sc", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  } 
 if( confp_get( conf, "TMP_DIR")==NULL ) {
    die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf, "BLAT_GFSERVER")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) {
    die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "PSEUDOGENE_DIR")==NULL ) {
   die("%s:\tCannot find PSEUDOGENE_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
   return EXIT_FAILURE;
 }
 if( confp_get( conf, "PSEUDOGENE_FILENAME")==NULL ) {
   die("%s:\tCannot find PSEUDOGENE_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
   return EXIT_FAILURE;
 }
 
  cmd = stringCreate (100);
  // initializing the gfServers
  stringPrintf( cmd, "%s status %s %s &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT") );
  int ret = hlr_system( string(cmd), 1 );
  if( ret != 0 ) { // not initialized
    stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_genome.log start %s %s %s/%s  &", confp_get( conf, "BLAT_GFSERVER"), confp_get(conf, "TMP_DIR"),confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"));
    hlr_system( string( cmd ), 0 );
    long int startTime = time(0);
    stringPrintf( cmd , "%s status %s %s &2> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"));
    while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ;
    if( hlr_system( string(cmd), 1 ) != 0 )  {
      die("gfServer for %s/%s not initialized: %s %s %s", confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); 
      return EXIT_FAILURE;
    }
  } 
  // end initialization

  
  gfr_init ("-");
  gfrEntries =  gfr_parse ();
  if (arrayMax (gfrEntries) == 0){
    puts (gfr_writeHeader ());
    gfr_deInit ();
    return 0;
  }
  seqNames = textCreate (10000); 
  buffer = stringCreate (100);
  fnSequencesToAlign = stringCreate (100);
  count = 0;
  countRemoved = 0;

  stringPrintf( buffer, "%s/%s", confp_get( conf, "PSEUDOGENE_DIR"), confp_get( conf, "PSEUDOGENE_FILENAME") );
  intervalFind_addIntervalsToSearchSpace (string(buffer),0);

  puts (gfr_writeHeader ());
 
  for (i = 0; i < arrayMax (gfrEntries); i++) {
    currGE = arrp (gfrEntries,i,GfrEntry);
    homologousCount = 0;
    minReadSize=10000;
    // creating two fasta files with the two genes
    
    stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript1.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA") , confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript1, currGE->startTranscript1, currGE->endTranscript1, confp_get(conf, "TMP_DIR"), currGE->id);
    hlr_system( string(cmd) , 0);   
    stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript2.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA"),  confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript2, currGE->startTranscript2, currGE->endTranscript2, confp_get(conf, "TMP_DIR"), currGE->id);
    hlr_system( string(cmd) , 0);   
    
    Stringa fa1 = stringCreate( 100 ); 
    Stringa fa2 = stringCreate( 100 );
    
    // creating the two fasta files with the reads
    stringPrintf( fa1, "%s/%s_reads1.fa", confp_get(conf, "TMP_DIR"), currGE->id);
    if (!(freads1 = fopen ( string(fa1) ,"w"))) {
      die ("Unable to open file: %s",string (fa1));
    }   
    // writing the reads of the first end into file
    
    for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) {
      char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l)); // read1
      readSize1 = strlen( currRead1 );
      if( readSize1 == 0 ) die("Read size cannot be zero: read1[ %s ]", currRead1);
      if( readSize1 < minReadSize ) minReadSize = readSize1;
      fprintf( freads1, ">%d\n%s\n", l, currRead1 );
      hlr_free( currRead1 );
    }
    fclose( freads1 );
    
    stringPrintf( fa2, "%s/%s_reads2.fa", confp_get(conf, "TMP_DIR"), currGE->id);
    if (!(freads2 = fopen ( string(fa2) ,"w"))) {
      die ("Unable to open file: %s",string (fa2));
    } 
    // writing the reads of the second end into file
    for (l = 0; l < arrayMax (currGE->readsTranscript2); l++) {
      char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l)); // read2
      readSize2 = strlen( currRead2 );
      if( readSize2 == 0 ) die("Read size cannot be zero: read2[ %s ]", currRead2);
      if( readSize2 < minReadSize ) minReadSize = readSize2;
      fprintf( freads2, ">%d\n%s\n", l, currRead2 );
      hlr_free( currRead2 );
    }
    fclose( freads2 );      
    
    // collapse the reads 2  ## requires the FASTX package
    stringPrintf( cmd, "%s -i %s/%s_reads2.fa -o %s/%s_reads2.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    hlr_system (string (cmd),0);
    
    //blat of reads2 against the first transcript
    stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript1.fa %s/%s_reads2.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id );
    
    // reading the results of blast from Pipe
    blatParser_initFromPipe( string(cmd) );
    while( blQ = blatParser_nextQuery() ) {
      int nucleotideOverlap = getNucleotideOverlap ( blQ );
      if ( nucleotideOverlap > ( ((double)readSize2)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) {
	char* value = strchr(blQ->qName,'-');
	homologousCount+=atoi(value+1);
      }
    }
    blatParser_deInit();
    
    // collapse the reads 1 ## requires the FASTX package on the path
    stringPrintf( cmd, "%s -i %s/%s_reads1.fa -o %s/%s_reads1.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    hlr_system (string (cmd),0);
    
    //blat of reads1 against the second transcript
    stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript2.fa %s/%s_reads1.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    
    blatParser_initFromPipe( string(cmd) );
    while( blQ = blatParser_nextQuery() ) {		
      int nucleotideOverlap = getNucleotideOverlap ( blQ );
      if ( nucleotideOverlap > ( ((double)readSize1)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) {
	char* value = strchr(blQ->qName,'-');
	homologousCount+=atoi(value+1);
      }
    }
    blatParser_deInit();
    stringPrintf (cmd,"cd %s;rm -rf %s_reads?.fa %s_reads?.collapsed.fa %s_transcript?.fa", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id);
    hlr_system( string(cmd) , 0);      
    if (((double)homologousCount / (double)arrayMax(currGE->readsTranscript1)) <= atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) { 
      homologousCount = 0;
      // there is no homology between the two genes, but what about the rest of the genome
      writeFasta( currGE, &minReadSize,  confp_get(conf, "TMP_DIR") );
      stringPrintf(cmd, "cd %s; %s %s %s / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.smallhomology.psl &>/dev/null", confp_get(conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), minReadSize - (int)(0.1 * minReadSize) > 20 ? minReadSize - (int) (0.1 * minReadSize) : 20 ,  currGE->id,  currGE->id);
      int attempts=0;
      ret = hlr_system( string(cmd), 1 );
      while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++;
      if( attempts == 5000 ) {
	die("Cannot map the reads %s", string( cmd ));
	return EXIT_FAILURE;
      }
      // reading the results of blast from File
      stringPrintf(cmd,  "%s/%s.smallhomology.psl", confp_get( conf, "TMP_DIR"), currGE->id);
      blatParser_initFromFile( string(cmd) );
      tooMany = 1;
      while( blQ = blatParser_nextQuery() ) {
	tooMany = 0;
	checkPseudogeneOverlap( blQ );
	if( arrayMax( blQ->entries ) > 1 ) {
	  homologousCount+= arrayMax( blQ->entries ) - 1;
	  char* value = strchr( blQ->qName,'/' );
	  if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName );
	  int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry
	  GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead );
	  currGIR->flag = 1;
	}
      }
      blatParser_deInit();
      if (  tooMany == 1 || ( ( (double) homologousCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) )  > atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) ) {
	countRemoved++;
	stringPrintf (cmd,"cd %s; rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id);
	hlr_system( string(cmd), 1 );
	continue;
      }
      // writing the gfrEntry, if everthing else didn't stop 
      if( homologousCount > 0 ) updateStats( currGE );
      puts (gfr_writeGfrEntry (currGE));
      count++;
      // removing temporary files
      stringPrintf (cmd,"cd %s;rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa  %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id);
      hlr_system( string(cmd) , 1);      
    } else {
      countRemoved++;
    }
    
  }

  gfr_deInit ();

  stringDestroy (fnSequencesToAlign);
  stringDestroy (cmd);
  stringDestroy (buffer);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);  
  warn ("%s_numGfrEntries: %d",argv[0],count);

  confp_close(conf);

  return EXIT_SUCCESS;
}
Exemple #5
0
int main (int argc, char *argv[])
{
  Array intervals;
  Interval *currInterval;
  SubInterval *currSubInterval;
  int h,i,j;
  Array seqs;
  Seq *currSeq,testSeq;
  int index;
  Stringa buffer;
  Array geneTranscriptEntries;
  Texta geneTranscriptIds;
  Array alterations;
  Alteration *currAlteration,*nextAlteration;
  char *proteinSequenceBeforeIndel;
  char *proteinSequenceAfterIndel;
  int numDisabledTranscripts;
  Stringa disabledTranscripts;
  int seqLength,refLength,altLength;
  char *sequenceBeforeIndel = NULL;
  int overlapMode;
  int numOverlaps;
  int sizeIndel,indelOffset;
  int overlap;
  Array coordinates;
  VcfEntry *currVcfEntry;
  VcfGenotype *currVcfGenotype;
  int position;
  Texta alternateAlleles;
  int flag1,flag2;
  
  if (argc != 3) {
    usage ("%s <annotation.interval> <annotation.fa>",argv[0]);
  }
  intervalFind_addIntervalsToSearchSpace (argv[1],0);
  geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ());
  seq_init ();
  fasta_initFromFile (argv[2]);
  seqs = fasta_readAllSequences (0);
  fasta_deInit ();
  arraySort (seqs,(ARRAYORDERF)util_sortSequencesByName); 
  buffer = stringCreate (100);
  disabledTranscripts = stringCreate (100);
  alterations = arrayCreate (100,Alteration);
  vcf_init ("-");
  stringPrintf (buffer,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s\">",argv[1]);
  vcf_addComment (string (buffer));
  puts (vcf_writeMetaData ());
  puts (vcf_writeColumnHeaders ());
  while (currVcfEntry = vcf_nextEntry ()) {
    if (vcf_isInvalidEntry (currVcfEntry)) {
      continue;
    }
    flag1 = 0;
    flag2 = 0;
    position = currVcfEntry->position - 1; // make zero-based
    alternateAlleles = vcf_getAlternateAlleles (currVcfEntry);
    for (h = 0; h < arrayMax (alternateAlleles); h++) {
      refLength = strlen (currVcfEntry->referenceAllele);
      altLength = strlen (textItem (alternateAlleles,h));
      sizeIndel = abs (refLength - altLength);
      indelOffset = MAX (refLength,altLength) - 1; 
      util_clearAlterations (alterations);
      intervals = intervalFind_getOverlappingIntervals (currVcfEntry->chromosome,position,position + indelOffset);
      for (i = 0; i < arrayMax (intervals); i++) {
        currInterval = arru (intervals,i,Interval*);
        overlapMode = OVERLAP_NONE;
        numOverlaps = 0;
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          overlap = rangeIntersection (position,position + indelOffset,currSubInterval->start,currSubInterval->end);
          if (currSubInterval->start <= position && (position + indelOffset) < currSubInterval->end) {
            overlapMode = OVERLAP_FULLY_CONTAINED;
            numOverlaps++;
          }
          else if (j == 0 && overlap > 0 && position < currSubInterval->start) {
            overlapMode = OVERLAP_START;
            numOverlaps++;
          }
          else if (j == (arrayMax (currInterval->subIntervals) - 1) && overlap > 0 && (position + indelOffset) >= currSubInterval->end) {
            overlapMode = OVERLAP_END;
            numOverlaps++;
          }
          else if (overlap > 0 && overlap <= indelOffset) {
            overlapMode = OVERLAP_SPLICE;
            numOverlaps++;
          }
        }
        if (overlapMode == OVERLAP_NONE) {
          continue;
        }
        currAlteration = arrayp (alterations,arrayMax (alterations),Alteration);
        if (numOverlaps > 1) {
          util_addAlteration (currAlteration,currInterval->name,"multiExonHit",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_SPLICE) {
          util_addAlteration (currAlteration,currInterval->name,"spliceOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_START) {
          util_addAlteration (currAlteration,currInterval->name,"startOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_END) {
          util_addAlteration (currAlteration,currInterval->name,"endOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength > refLength) {
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"insertionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"insertionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength < refLength) {
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"deletionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"deletionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength == refLength) {
          util_addAlteration (currAlteration,currInterval->name,"substitution",currInterval,position,0);
        }
        else {
          die ("Unexpected type: %d %s %s %s",
               currVcfEntry->position,currVcfEntry->chromosome,
               currVcfEntry->referenceAllele,currVcfEntry->alternateAllele);
        }
        if ((sizeIndel % 3) != 0 && altLength != refLength) { 
          continue;
        }
        // Only run the remaining block of code if the indel is fully contained (insertion or deletion) AND does not cause a frameshift OR
        // if it is a substitution that is fully contained in the coding sequence
        stringPrintf (buffer,"%s|%s|%c|",currInterval->name,currInterval->chromosome,currInterval->strand);
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          stringAppendf (buffer,"%d|%d%s",currSubInterval->start,currSubInterval->end,j < arrayMax (currInterval->subIntervals) - 1 ? "|" : "");
        }
        testSeq.name = hlr_strdup (string (buffer));
        if (!arrayFind (seqs,&testSeq,&index,(ARRAYORDERF)util_sortSequencesByName)) {
          die ("Expected to find %s in seqs",string (buffer));
        }
        hlr_free (testSeq.name);
        currSeq = arrp (seqs,index,Seq);
        strReplace (&sequenceBeforeIndel,currSeq->sequence);
        seqLength = strlen (sequenceBeforeIndel); 
        coordinates = util_getCoordinates (currInterval);
        // arraySort (coordinates,(ARRAYORDERF)util_sortCoordinatesByChromosomeAndTranscriptPosition); Array is already sorted by definition
        j = 0;
        stringClear (buffer);
        while (j < seqLength) {
          if (util_getGenomicCoordinate (coordinates,j,currVcfEntry->chromosome) == position) {
            if (altLength > refLength) {
              stringCat (buffer,textItem (alternateAlleles,h));
              j++;
              continue;
            }
            else if (altLength < refLength) {
              stringCatChar (buffer,sequenceBeforeIndel[j]);
              j = j + refLength - altLength + 1;
              continue;
            }
            else {
              stringCat (buffer,textItem (alternateAlleles,h));
              j = j + altLength;
              continue;
            }
          }
          stringCatChar (buffer,sequenceBeforeIndel[j]);
          j++;
        }
        util_destroyCoordinates (coordinates);
        proteinSequenceBeforeIndel = hlr_strdup (util_translate (currInterval,sequenceBeforeIndel));
        proteinSequenceAfterIndel = hlr_strdup (util_translate (currInterval,string (buffer)));
        addSubstitution (currAlteration,proteinSequenceBeforeIndel,proteinSequenceAfterIndel,indelOffset);
        hlr_free (proteinSequenceBeforeIndel);
        hlr_free (proteinSequenceAfterIndel);
      }
      if (arrayMax (alterations) == 0) {
        continue;
      }
      arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType);
      stringClear (buffer);
      i = 0;
      while (i < arrayMax (alterations)) {
        currAlteration = arrp (alterations,i,Alteration);
        stringAppendf (buffer,"%s%d:%s:%s:%c:%s",stringLen (buffer) == 0 ? "" : ",",h + 1,currAlteration->geneName,currAlteration->geneId,currAlteration->strand,currAlteration->type);
         stringClear (disabledTranscripts);
        if (currAlteration->substitution[0] != '\0') {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d_%s",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition,currAlteration->substitution);
        }
        else if (strEqual (currAlteration->type,"multiExonHit") || strEqual (currAlteration->type,"spliceOverlap") ||
                 strEqual (currAlteration->type,"startOverlap") || strEqual (currAlteration->type,"endOverlap")) {
          stringAppendf (disabledTranscripts,"%s:%s:%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength);
        }
        else {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition);
        }
        numDisabledTranscripts = 1;
        j = i + 1;
        while (j < arrayMax (alterations)) {
          nextAlteration = arrp (alterations,j,Alteration);
          if (strEqual (currAlteration->geneId,nextAlteration->geneId) && 
              strEqual (currAlteration->type,nextAlteration->type)) {
            if (nextAlteration->substitution[0] != '\0') {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d_%s",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition,nextAlteration->substitution);
            }
            else if (strEqual (nextAlteration->type,"multiExonHit") || strEqual (nextAlteration->type,"spliceOverlap") ||
                     strEqual (nextAlteration->type,"startOverlap") || strEqual (nextAlteration->type,"endOverlap")) {
              stringAppendf (disabledTranscripts,":%s:%s:%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength);
            }
            else {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition);
            }
            numDisabledTranscripts++;
          }
          else {
            break;
          }
          j++;
        }
        i = j;
        geneTranscriptIds = util_getTranscriptIdsForGeneId (geneTranscriptEntries,currAlteration->geneId);
        stringAppendf (buffer,":%d/%d:%s",numDisabledTranscripts,arrayMax (geneTranscriptIds),string (disabledTranscripts));
      }
      if (flag1 == 0) {
        printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=",
                currVcfEntry->chromosome,currVcfEntry->position,currVcfEntry->id,
                currVcfEntry->referenceAllele,currVcfEntry->alternateAllele,
                currVcfEntry->quality,currVcfEntry->filter,currVcfEntry->info);
        flag1 = 1;
      }
      printf ("%s%s",flag2 == 1 ? "," : "",string (buffer)); 
      flag2 = 1;
    }
    if (flag1 == 1) {
      for (i = 0; i < arrayMax (currVcfEntry->genotypes); i++) {
        currVcfGenotype = arrp (currVcfEntry->genotypes,i,VcfGenotype);
        if (i == 0) {
          printf ("\t%s\t",currVcfEntry->genotypeFormat);
        }
        printf ("%s%s%s%s",currVcfGenotype->genotype,
                currVcfGenotype->details[0] != '\0' ? ":" : "",
                currVcfGenotype->details[0] != '\0' ?  currVcfGenotype->details : "",
                i < arrayMax (currVcfEntry->genotypes) - 1 ? "\t" : ""); 
      }
      puts ("");
    }
  }
  vcf_deInit ();
  return 0;
}
Exemple #6
0
int main(int argc, char *argv[])
{
	Array breakPoints;
	BreakPoint *currBP;
	BreakPointRead *currBPR;
	int minNumReads, minNumUniqueOffsets,
	    minNumReadsForKS,numPossibleOffsets;
	double pValueCutoffForKS;
	Array offsets;
	Array randomNumbers;
	double *observedOffsets;
	double *randomOffsets;

 	if (argc != 6) {
		usage((char*) "%s <minNumReads> <minNumUniqueOffsets> "
              "<minNumReadsForKS> <pValueCutoffForKS> <numPossibleOffsets>", 
              argv[0]);
    }
	
	minNumReads         = std::atoi(argv[1]);
	minNumUniqueOffsets = std::atoi(argv[2]);
	minNumReadsForKS    = std::atoi(argv[3]);
	pValueCutoffForKS   = std::atof(argv[4]);
	numPossibleOffsets  = std::atoi(argv[5]);
	bp_init("-");
	offsets = arrayCreate(100, int);
	randomNumbers = arrayCreate(100, int);
	breakPoints = bp_getBreakPoints();

	for (int i = 0; i < arrayMax(breakPoints); i++) {
		currBP = arrp(breakPoints, i, BreakPoint);
		arrayClear(offsets);
		for (int j = 0; j < arrayMax(currBP->breakPointReads); j++) {
			currBPR = arrp(currBP->breakPointReads, j, BreakPointRead);
			array(offsets, arrayMax(offsets), int) = currBPR->offset;
		}
		arraySort(offsets, (ARRAYORDERF) arrayIntcmp);
		arrayUniq(offsets, NULL, (ARRAYORDERF) arrayIntcmp);
		if (arrayMax(currBP->breakPointReads) >= minNumReads && 
		    arrayMax(currBP->breakPointReads) < minNumReadsForKS) {        
			if (arrayMax(offsets) >= minNumUniqueOffsets)
				std::puts(bp_writeBreakPoint(currBP));
		}
		else if (arrayMax(currBP->breakPointReads) >= minNumReads && 
			 arrayMax(currBP->breakPointReads) >= minNumReadsForKS) {
			arrayClear(randomNumbers);
			for (int j = 0; j < arrayMax(offsets); j++)
				array(randomNumbers, arrayMax(randomNumbers), int) = std::rand() % numPossibleOffsets;
			
			arraySort(randomNumbers, (ARRAYORDERF) arrayIntcmp);
			observedOffsets = (double *) hlr_malloc(arrayMax(offsets) * sizeof(double)); 
			randomOffsets = (double *) hlr_malloc(arrayMax(offsets) * sizeof(double)); 
			for (int j = 0; j < arrayMax(offsets); j++) {
				observedOffsets[j] = arru(offsets, j, int);
				randomOffsets[j] = arru(randomNumbers, j, int);
			}
			if (pValueCutoffForKS < TMath::KolmogorovTest(arrayMax(offsets), 
								      observedOffsets, 
								      arrayMax(offsets), 
								      randomOffsets, 
								      ""))
				std::puts(bp_writeBreakPoint(currBP));
			
			hlr_free(observedOffsets);
			hlr_free(randomOffsets);
		}
	}