Exemple #1
0
/**
 * Reads integers from standard input until the sentinel value 42 is read,
 * 1000 values have been stored, or input ends, then sorts the stored values
 * with arraySort() and prints them one per line.
 *
 * @return 0 always.
 */
int main()
{
    int numbers[1000];
    int count = 0;
    int i;

    while (count < 1000)
    {
        int number;

        // scanf() returns the number of converted items. Anything other than 1
        // means EOF or malformed input, in which case `number` holds no valid
        // value and must not be compared or stored.
        if (scanf("%d", &number) != 1)
        {
            break;
        }
        // 42 is the end-of-input sentinel; it is not part of the data.
        if (number == 42)
        {
            break;
        }
        numbers[count] = number;
        count++;
    }

    arraySort(numbers, count);

    for (i = 0; i < count; i++)
    {
        printf("%d\n", numbers[i]);
    }
    return 0;
}
Exemple #2
0
int main (int argc, char *argv[])
{
  Array breakPoints;
  BreakPoint *currBP;
  int i;
  char *breakPointSequence;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
    return EXIT_FAILURE;

  bp_init ("-");
  breakPoints = bp_getBreakPoints ();
  arraySort (breakPoints,(ARRAYORDERF)sortBreakPointsByTargetAndOffset);
  
  for (i = 0; i < arrayMax (breakPoints); i++) {
    currBP = arrp (breakPoints,i,BreakPoint);
    breakPointSequence = getBreakPointSequence (currBP->tileCoordinate1,currBP->tileCoordinate2);
    printf( ">%s|%s\n%s\n", currBP->tileCoordinate1, currBP->tileCoordinate2, breakPointSequence);
    warn(">%s|%s\n%s", 
	 currBP->tileCoordinate1, 
	 currBP->tileCoordinate2, 
	 subString(breakPointSequence, 10, strlen(breakPointSequence)-10));
  }
  bp_deInit();
  confp_close(Conf);

  return EXIT_SUCCESS;
}
/**
 * Builds currGE->pairCounts from currGE->interReads.
 *
 * The inter-reads are sorted in place with sortGfrInterReads. Each run of
 * consecutive inter-reads sharing the same (pairType, number1, number2) then
 * yields one GfrPairCount whose count is the length of that run.
 *
 * @param currGE entry whose interReads are sorted and whose pairCounts array
 *               is replaced by a newly created one.
 */
static void obtainPairCounts (GfrEntry *currGE)
{
	GfrPairCount *tally;
	GfrInterRead *runHead,*candidate;
	int runStart,next;

	currGE->pairCounts = arrayCreate (100,GfrPairCount);
	arraySort (currGE->interReads,(ARRAYORDERF)sortGfrInterReads);

	for (runStart = 0; runStart < arrayMax (currGE->interReads); runStart = next) {
		runHead = arrp (currGE->interReads,runStart,GfrInterRead);

		/* Open a new pair count for the run starting at runStart. */
		tally = arrayp (currGE->pairCounts,arrayMax (currGE->pairCounts),GfrPairCount);
		tally->number1 = runHead->number1;
		tally->number2 = runHead->number2;
		tally->pairType = runHead->pairType;
		tally->count = 1;

		/* Extend the run while the following inter-reads match its head. */
		for (next = runStart + 1; next < arrayMax (currGE->interReads); next++) {
			candidate = arrp (currGE->interReads,next,GfrInterRead);
			if (runHead->pairType != candidate->pairType ||
			    runHead->number1 != candidate->number1 ||
			    runHead->number2 != candidate->number2) {
				break;
			}
			tally->count++;
		}
	}
}
Exemple #4
0
int main (int argc, char *argv[])
{
	GfrEntry *currGE;
	int count;
	int countRemoved;
	int i;

	if (argc != 3) {
		usage ("%s <offsetCutoff> <minNumUniqueReads>",argv[0]);
	}
	count = 0;
	countRemoved = 0;

	int offsetCutOff = atoi (argv[1]);
	int minNumUniqueReads = atoi (argv[2]);

	gfr_init ("-");
	puts (gfr_writeHeader ());
	while (currGE = gfr_nextEntry ()) {
		Array starts = arrayCreate( 100, int);
		for (i = 0; i < arrayMax( currGE->interReads ); i++) {
			int currStart = arrp(currGE->interReads, i, GfrInterRead)->readStart1 + arrp(currGE->interReads, i, GfrInterRead)->readStart2;
			array(starts, arrayMax(starts), int) = currStart; 
		}
		arraySort( starts, (ARRAYORDERF) arrayIntcmp );
		arrayUniq( starts, NULL, (ARRAYORDERF) arrayIntcmp ) ;
		int numUniqeOffsets = arrayMax( starts );
		arrayDestroy( starts );

	if (arrayMax( currGE->readsTranscript1 ) != arrayMax( currGE->readsTranscript2 ) )
		die( "The two ends have a different number of reads");
	Texta reads = textCreate(arrayMax(currGE->readsTranscript1));
	for (i = 0; i < arrayMax(currGE->readsTranscript1); i++) {
		Stringa strA = stringCreate( strlen(textItem( currGE->readsTranscript1, i) ) * 2 + 1);
		stringAppendf( strA, textItem( currGE->readsTranscript1,i));
		stringAppendf( strA, textItem( currGE->readsTranscript2,i)); 
		textAdd( reads, string(strA));
		stringDestroy( strA );
	}
	textUniqKeepOrder( reads );
	int numRemaining = arrayMax( reads );
	textDestroy ( reads );

	if (numRemaining <= minNumUniqueReads || numUniqeOffsets <= offsetCutOff) {
		countRemoved++;
		continue;
	} 
	puts (gfr_writeGfrEntry (currGE));
	count++;
	}
	gfr_deInit ();
	warn("%s_PCRFilter: offset=%d minNumUniqueReads=%d",
	     argv[0],offsetCutOff, minNumUniqueReads);
	warn("%s_numRemoved: %d",argv[0],countRemoved);
	warn("%s_numGfrEntries: %d",argv[0],count);
	return 0;
}
Exemple #5
0
/**
 * Converts MRF entries read via mrf_init ("-") into one GFF file per target.
 *
 * Every MRF entry is passed to processRead(), which fills `gffEntries`.
 * The entries are then sorted with sortGffEntriesByTargetNameAndGroupNumber
 * and each run of consecutive entries with the same targetName is written to
 * the file "<prefix>_<targetName>.gff", preceded by a browser line and a
 * track line.
 *
 * Usage: <prog> <prefix>
 */
int main (int argc, char *argv[])
{
  int i,j,groupNumber;
  MrfEntry *currEntry;
  GffEntry *currGffEntry,*nextGffEntry;
  Array gffEntries;
  FILE *fp;
  Stringa buffer;

  if (argc != 2) {
    usage ("%s <prefix>",argv[0]);
  }
  buffer = stringCreate (1000);
  groupNumber = 0;
  mrf_init ("-");
  gffEntries = arrayCreate (100000,GffEntry);
  while ((currEntry = mrf_nextEntry ()) != NULL) {
    processRead (gffEntries, currEntry, &groupNumber);
  }
  mrf_deInit ();

  arraySort (gffEntries,(ARRAYORDERF)sortGffEntriesByTargetNameAndGroupNumber);
  i = 0;
  while (i < arrayMax (gffEntries)) {
    /* Entry i starts a new target: open its file and write the header. */
    currGffEntry = arrp (gffEntries,i,GffEntry);
    stringPrintf (buffer,"%s_%s.gff",argv[1],currGffEntry->targetName);
    fp = fopen (string (buffer),"w");
    if (fp == NULL) {
      die ("Unable to open file: %s",string (buffer));
    }
    fprintf (fp,"browser hide all\n");
    fprintf (fp,"track name=\"%s_%s\" visibility=2\n",argv[1],currGffEntry->targetName);
    fprintf (fp,"%s\n",currGffEntry->line);

    /* Append the following entries for as long as they share the target. */
    j = i + 1;
    while (j < arrayMax (gffEntries)) {
      nextGffEntry = arrp (gffEntries,j,GffEntry);
      if (!strEqual (currGffEntry->targetName,nextGffEntry->targetName)) {
        break;
      }
      fprintf (fp,"%s\n",nextGffEntry->line);
      j++;
    }
    i = j;
    fclose (fp);
  }
  stringDestroy (buffer);
  return 0;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  Array kgTreeFams;
  Stringa buffer;
  int count;
  int countRemoved;

  config *conf;

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
    return EXIT_FAILURE;

  buffer = stringCreate (100);
  stringPrintf (buffer,"%s/%s",
                confp_get(conf, "ANNOTATION_DIR"), 
		confp_get(conf, "KNOWN_GENE_TREE_FAM_FILENAME"));
  kgTreeFams = util_readKnownGeneTreeFams (string (buffer));
  arraySort (kgTreeFams,(ARRAYORDERF)sortKgTreeFamsByTranscriptName);
  stringDestroy (buffer);

  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()){
    if (isHomologous (kgTreeFams,currGE->nameTranscript1,currGE->nameTranscript2)) {
      countRemoved++;
      continue;
    }
    puts (gfr_writeGfrEntry (currGE));
    count++;
  }
  gfr_deInit ();
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);

  confp_close(conf);

  return EXIT_SUCCESS;
}
/**
 * \file bgrQuantifier <annotation.interval>.
 * \pre: it requires a BedGraph file from STDIN normalized by the number of mapped nucleotides
 *
 * For every interval in <annotation.interval> the BedGraph values returned by
 * bgrParser_getValuesForRegion() for [start, end) on the interval's chromosome
 * are summed and divided by (length / 1000.0), where length = end - start.
 * One line per interval is printed: name, chromosome:start+1-end, value.
 */
int main( int argc, char* argv[] ) {
  Array bgrs;
  Array intervals;
  Array entries;
  int i, j, length;
  double value;
  if( argc < 2 ) {
    usage("%s <annotation.interval>\n%s requires a BedGraph from STDIN", argv[0], argv[0]);
  }
  // bgrParser_getAllEntries() supplies the array; allocating one beforehand
  // would only leak it when the variable is reassigned.
  bgrParser_initFromFile ( "-" );
  bgrs = bgrParser_getAllEntries ();
  bgrParser_deInit();
  arraySort( bgrs, (ARRAYORDERF) bgrParser_sort );

  intervalFind_addIntervalsToSearchSpace ( argv[1], 0 );
  intervals = intervalFind_getAllIntervals ();

  for( i=0; i<arrayMax(intervals); i++ ) {
    Interval *currInterval = arrp( intervals, i, Interval );
    length = currInterval->end - currInterval->start;
    entries = bgrParser_getValuesForRegion( bgrs, currInterval->chromosome, currInterval->start, currInterval->end);
    value = 0.0;
    for( j=0; j<arrayMax( entries ); j++)
      value += arru( entries, j, double );

    // NOTE(review): a zero-length interval makes the divisor 0.0 and prints
    // inf/nan -- confirm whether such intervals can occur in the input.
    printf("%s\t%s:%d-%d\t%f\n", currInterval->name,
	   currInterval->chromosome,
	   currInterval->start+1,
	   currInterval->end,
	   value / (length / 1000.0) );
    arrayDestroy( entries );
  }
  arrayDestroy( intervals );
  return 0;
}
/**
 * Annotates tab-delimited lines with gene symbol and description.
 *
 * The cross-reference table is loaded from
 * "<ANNOTATION_DIR>/<KNOWN_GENE_XREF_FILENAME>" (configuration opened via
 * FUSIONSEQ_CONFPATH) and sorted by transcript name. For each line read from
 * "-", the first column is taken as "<transcript>" or "<transcript>_<exonID>"
 * (split at the last '_'). The output line is
 *   <first column> TAB <gene symbol> TAB <exonID or 1> TAB <description>
 * followed by the remaining columns of the input line.
 *
 * @return EXIT_FAILURE if the configuration cannot be opened, else EXIT_SUCCESS.
 */
int main (int argc, char *argv[])
{
	Array kgXrefs;
	Stringa buffer;
	LineStream ls;
	int count=0;
	char* geneSymbolTranscript;
	char* descriptionTranscript;
	char* line;

	config *conf;

	if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
		return EXIT_FAILURE;

	buffer = stringCreate (100);

	stringPrintf (buffer,"%s/%s",
		      confp_get(conf, "ANNOTATION_DIR"),
		      confp_get(conf, "KNOWN_GENE_XREF_FILENAME"));
	kgXrefs = util_readKnownGeneXrefs (string (buffer));
	arraySort (kgXrefs,(ARRAYORDERF)sortKgXrefsByTranscriptName);
	stringDestroy (buffer);

	ls = ls_createFromFile("-");

	while ((line = ls_nextLine(ls)) != NULL) {
		/* Per-line state: starts out NULL on every iteration so that the
		   branch below never depends on what hlr_free() leaves behind. */
		char *exonID = NULL;
		/* Untouched copy of the line, used to echo the remaining columns. */
		char *lineP = hlr_strdup(line);
		WordIter w = wordIterCreate( line, "\t", 0);
		char *nameTranscript = wordNext( w );
		/* Length of the whole first column, measured BEFORE it is truncated
		   at '_'. Measuring afterwards would make the echoed remainder start
		   at "_<exonID>" instead of at the tab that follows the first column. */
		size_t firstColumnLength = strlen(nameTranscript);
		char *p = strrchr(nameTranscript, '_');
		if (p) {
			exonID = hlr_strdup( p+1 );
			*p='\0';
		}
		transcript2geneSymbolAndGeneDescription(kgXrefs,
							nameTranscript,
							&geneSymbolTranscript,
							&descriptionTranscript);
		if (exonID) {
			printf("%s_%s\t%s\t%s\t%s",
				nameTranscript,
				exonID,
				geneSymbolTranscript,
				exonID,
				descriptionTranscript);
			hlr_free(exonID);
		} else {
			printf("%s\t%s\t1\t%s",
				nameTranscript,
				geneSymbolTranscript,
				descriptionTranscript);
		}
		printf("%s\n", lineP+firstColumnLength);
		count++;
		hlr_free(lineP);
		wordIterDestroy(w);
	}
	ls_destroy (ls);
	warn ("%s_numGfrEntries: %d",argv[0],count);
	confp_close(conf);

	return EXIT_SUCCESS;
}
Exemple #9
0
/**
 * Annotates VCF entries with the annotation intervals they fall into.
 *
 * Usage: <prog> <annotation.interval> <nameFeature>
 *
 * VCF entries are read via vcf_init ("-"). For every alternate allele of a
 * valid entry, each overlapping interval that has a sub-interval fully
 * containing the variant span yields an alteration of type <nameFeature>.
 * Alterations are sorted and grouped by gene id and type, and the result is
 * appended to the INFO column as "VA=...". Entries for which no allele
 * produced an alteration are not printed at all.
 */
int main (int argc, char *argv[])
{
  Array intervals;
  Interval *currInterval;
  SubInterval *currSubInterval;
  int refLength,altLength,offset;
  int h,i,j;
  Stringa buffer;
  Array geneTranscriptEntries;
  Texta geneTranscriptIds;
  Array alterations;
  Alteration *currAlteration,*nextAlteration;
  int numTranscripts;
  Stringa transcripts;
  VcfEntry *currVcfEntry;
  int position;
  Texta alternateAlleles;
  int flag1,flag2;
  VcfGenotype *currVcfGenotype;
 
  if (argc != 3) {
    usage ("%s <annotation.interval> <nameFeature>",argv[0]);
  }
  intervalFind_addIntervalsToSearchSpace (argv[1],0);
  geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ());
  buffer = stringCreate (100);
  transcripts = stringCreate (100);
  alterations = arrayCreate (100,Alteration);
  vcf_init ("-");
  // Register the VA INFO field, then echo the meta data and column headers.
  stringPrintf (buffer,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s, %s\">",argv[1],argv[2]);
  vcf_addComment (string (buffer));
  puts (vcf_writeMetaData ());
  puts (vcf_writeColumnHeaders ());
  while (currVcfEntry = vcf_nextEntry ()) {
    if (vcf_isInvalidEntry (currVcfEntry)) {
      continue;
    }
    // flag1: the fixed columns of this entry have been printed.
    // flag2: at least one allele annotation has been printed (comma needed).
    flag1 = 0;
    flag2 = 0;
    position = currVcfEntry->position - 1; // make zero-based
    alternateAlleles = vcf_getAlternateAlleles (currVcfEntry);
    for (h = 0; h < arrayMax (alternateAlleles); h++) {
      refLength = strlen (currVcfEntry->referenceAllele);
      altLength = strlen (textItem (alternateAlleles,h));
      // position + offset is the last coordinate covered by the longer allele.
      offset = MAX (refLength,altLength) - 1; 
      util_clearAlterations (alterations);
      intervals = intervalFind_getOverlappingIntervals (currVcfEntry->chromosome,position,position + offset);
      for (i = 0; i < arrayMax (intervals); i++) {
        currInterval = arru (intervals,i,Interval*);
        // Look for the first sub-interval that fully contains the variant span.
        j = 0; 
        while (j < arrayMax (currInterval->subIntervals)) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          if (currSubInterval->start <= position && (position + offset) < currSubInterval->end) {
            break;
          }
          j++;
        }
        // No containing sub-interval: this interval is not annotated.
        if (j == arrayMax (currInterval->subIntervals)) {
          continue;
        }
        util_addAlteration (arrayp (alterations,arrayMax (alterations),Alteration),currInterval->name,argv[2],currInterval,position,0);
      }
      if (arrayMax (alterations) == 0) {
        continue;
      }
      arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType);
      stringClear (buffer);
      // Walk the sorted alterations; each run with equal geneId and type
      // becomes one '|'-separated group in buffer.
      i = 0;
      while (i < arrayMax (alterations)) {
        currAlteration = arrp (alterations,i,Alteration);
        stringAppendf (buffer,"%s%d:%s:%s:%c:%s",stringLen (buffer) == 0 ? "" : "|",h + 1,currAlteration->geneName,currAlteration->geneId,currAlteration->strand,currAlteration->type);
        stringClear (transcripts);
        stringAppendf (transcripts,"%s:%s:%d_%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition);
        numTranscripts = 1;
        j = i + 1;
        while (j < arrayMax (alterations)) {
          nextAlteration = arrp (alterations,j,Alteration);
          if (strEqual (currAlteration->geneId,nextAlteration->geneId) && 
              strEqual (currAlteration->type,nextAlteration->type)) {
            stringAppendf (transcripts,":%s:%s:%d_%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition);
            numTranscripts++;
          }
          else {
            break;
          }
          j++;
        }
        i = j;
        // Report "<transcripts in this group>/<transcript ids found for the gene>".
        geneTranscriptIds = util_getTranscriptIdsForGeneId (geneTranscriptEntries,currAlteration->geneId);
        stringAppendf (buffer,":%d/%d:%s",numTranscripts,arrayMax (geneTranscriptIds),string (transcripts));
      }
      // Print the fixed columns once, before the first allele annotation.
      if (flag1 == 0) {
        printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=",
                currVcfEntry->chromosome,currVcfEntry->position,currVcfEntry->id,
                currVcfEntry->referenceAllele,currVcfEntry->alternateAllele,
                currVcfEntry->quality,currVcfEntry->filter,currVcfEntry->info);
        flag1 = 1;
      }
      printf ("%s%s",flag2 == 1 ? "," : "",string (buffer)); 
      flag2 = 1;
    }
    // Only entries that received an annotation get their genotype columns
    // and the terminating newline.
    if (flag1 == 1) {
      for (i = 0; i < arrayMax (currVcfEntry->genotypes); i++) {
        currVcfGenotype = arrp (currVcfEntry->genotypes,i,VcfGenotype);
        if (i == 0) {
          printf ("\t%s\t",currVcfEntry->genotypeFormat);
        }
        printf ("%s%s%s%s",currVcfGenotype->genotype,
                currVcfGenotype->details[0] != '\0' ? ":" : "",
                currVcfGenotype->details[0] != '\0' ?  currVcfGenotype->details : "",
                i < arrayMax (currVcfEntry->genotypes) - 1 ? "\t" : ""); 
      }
      puts ("");
    }
  }
  vcf_deInit ();
  return 0;
}
/**
 * Picks a left border (xa) and a right border (xb) from a list of candidate
 * positions, relative to an initial interval [intervalA, intervalB].
 *
 * The candidates are first sorted in place with arraySort(). For each side
 * the search proceeds in three stages and stops at the first stage that
 * yields a candidate:
 *   1. the outer band next to the interval edge (half of 50% of the width);
 *   2. a small inner band (7% of the width) just inside the edge;
 *   3. a wider outer band (half of 70% of the width) beyond the first one.
 * Within each stage the last matching candidate in scan order is kept.
 * If no stage matches, the interval edge itself is used.
 *
 * @param size      upper bound used for clamping; right bands end at size - 1.
 * @param topList   candidate positions; sorted in place.
 * @param topNr     number of entries in topList.
 * @param intervalA initial left edge.
 * @param intervalB initial right edge.
 * @param xa        receives the chosen left border.
 * @param xb        receives the chosen right border.
 * @return always 0.0.
 */
double WindscreenLocator::judgeVerticalBorder(int size, int* topList, int topNr, int intervalA, int intervalB, int& xa, int& xb)
{
    arraySort(topList, topNr);

    const int span = intervalB - intervalA;
    const int innerMargin = span * 0.07;
    const int nearMargin = span * 0.5 / 2;
    const int farMargin = span * 0.7 / 2;

    // Stage 1 and 2 bands.
    int leftLow = std::max(0, intervalA - nearMargin);
    int leftHigh = intervalA + innerMargin;
    int rightHigh = std::min(intervalB + nearMargin, size - 1);
    int rightLow = intervalB - innerMargin;

    xa = -1;
    xb = -1;

    // Stage 1: left side scanned front to back, right side back to front.
    for(int front = 0, back = topNr - 1; front < topNr; ++front, --back){
        if(leftLow <= topList[front] && topList[front] <= intervalA){
            xa = topList[front];
        }
        if(intervalB <= topList[back] && topList[back] <= rightHigh){
            xb = topList[back];
        }
    }

    // Stage 2: inner bands, scanned towards the interval edge.
    if(xa == -1){
        int k = topNr;
        while(k-- > 0){
            if(intervalA <= topList[k] && topList[k] <= leftHigh){
                xa = topList[k];
            }
        }
    }
    if(xb == -1){
        int k = 0;
        while(k < topNr){
            if(rightLow <= topList[k] && topList[k] <= intervalB){
                xb = topList[k];
            }
            ++k;
        }
    }

    // Stage 3: the wider outer bands start where the stage-1 bands ended.
    leftHigh = leftLow;
    leftLow = std::max(0, intervalA - farMargin);
    rightLow = rightHigh;
    rightHigh = std::min(intervalB + farMargin, size - 1);

    if(xa == -1){
        int k = 0;
        while(k < topNr){
            if(leftLow <= topList[k] && topList[k] <= leftHigh){
                xa = topList[k];
            }
            ++k;
        }
    }
    if(xb == -1){
        int k = topNr;
        while(k-- > 0){
            if(rightLow <= topList[k] && topList[k] <= rightHigh){
                xb = topList[k];
            }
        }
    }

    // Fall back to the initial interval edges.
    if(xa == -1){
        xa = intervalA;
    }
    if(xb == -1){
        xb = intervalB;
    }
    return 0.0;
}
Exemple #11
0
/**
 * Annotates VCF entries (substitutions and indels) against transcript
 * annotation intervals and their sequences.
 *
 * Usage: <prog> <annotation.interval> <annotation.fa>
 *
 * VCF entries are read via vcf_init ("-"). For every alternate allele of a
 * valid entry, each overlapping interval is classified by how the variant
 * span overlaps its sub-intervals (fully contained, start, end, splice, or
 * several sub-intervals). Fully contained variants are typed as insertion or
 * deletion (FS / NFS depending on whether the size difference is a multiple
 * of 3) or substitution. For substitutions and non-frameshift indels the
 * transcript sequence is edited, both versions are translated and passed to
 * addSubstitution(). Results are grouped by gene id and type and appended to
 * the INFO column as "VA=...". Entries without any alteration are not printed.
 */
int main (int argc, char *argv[])
{
  Array intervals;
  Interval *currInterval;
  SubInterval *currSubInterval;
  int h,i,j;
  Array seqs;
  Seq *currSeq,testSeq;
  int index;
  Stringa buffer;
  Array geneTranscriptEntries;
  Texta geneTranscriptIds;
  Array alterations;
  Alteration *currAlteration,*nextAlteration;
  char *proteinSequenceBeforeIndel;
  char *proteinSequenceAfterIndel;
  int numDisabledTranscripts;
  Stringa disabledTranscripts;
  int seqLength,refLength,altLength;
  char *sequenceBeforeIndel = NULL;
  int overlapMode;
  int numOverlaps;
  int sizeIndel,indelOffset;
  int overlap;
  Array coordinates;
  VcfEntry *currVcfEntry;
  VcfGenotype *currVcfGenotype;
  int position;
  Texta alternateAlleles;
  int flag1,flag2;
  
  if (argc != 3) {
    usage ("%s <annotation.interval> <annotation.fa>",argv[0]);
  }
  intervalFind_addIntervalsToSearchSpace (argv[1],0);
  geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ());
  // Load all sequences and sort them by name so arrayFind() can be used below.
  seq_init ();
  fasta_initFromFile (argv[2]);
  seqs = fasta_readAllSequences (0);
  fasta_deInit ();
  arraySort (seqs,(ARRAYORDERF)util_sortSequencesByName); 
  buffer = stringCreate (100);
  disabledTranscripts = stringCreate (100);
  alterations = arrayCreate (100,Alteration);
  vcf_init ("-");
  // Register the VA INFO field, then echo the meta data and column headers.
  stringPrintf (buffer,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s\">",argv[1]);
  vcf_addComment (string (buffer));
  puts (vcf_writeMetaData ());
  puts (vcf_writeColumnHeaders ());
  while (currVcfEntry = vcf_nextEntry ()) {
    if (vcf_isInvalidEntry (currVcfEntry)) {
      continue;
    }
    // flag1: the fixed columns of this entry have been printed.
    // flag2: at least one allele annotation has been printed (comma needed).
    flag1 = 0;
    flag2 = 0;
    position = currVcfEntry->position - 1; // make zero-based
    alternateAlleles = vcf_getAlternateAlleles (currVcfEntry);
    for (h = 0; h < arrayMax (alternateAlleles); h++) {
      refLength = strlen (currVcfEntry->referenceAllele);
      altLength = strlen (textItem (alternateAlleles,h));
      sizeIndel = abs (refLength - altLength);
      // position + indelOffset is the last coordinate covered by the longer allele.
      indelOffset = MAX (refLength,altLength) - 1; 
      util_clearAlterations (alterations);
      intervals = intervalFind_getOverlappingIntervals (currVcfEntry->chromosome,position,position + indelOffset);
      for (i = 0; i < arrayMax (intervals); i++) {
        currInterval = arru (intervals,i,Interval*);
        // Classify the overlap against every sub-interval; overlapMode keeps
        // the mode of the last sub-interval that matched, numOverlaps counts them.
        overlapMode = OVERLAP_NONE;
        numOverlaps = 0;
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          overlap = rangeIntersection (position,position + indelOffset,currSubInterval->start,currSubInterval->end);
          if (currSubInterval->start <= position && (position + indelOffset) < currSubInterval->end) {
            overlapMode = OVERLAP_FULLY_CONTAINED;
            numOverlaps++;
          }
          else if (j == 0 && overlap > 0 && position < currSubInterval->start) {
            overlapMode = OVERLAP_START;
            numOverlaps++;
          }
          else if (j == (arrayMax (currInterval->subIntervals) - 1) && overlap > 0 && (position + indelOffset) >= currSubInterval->end) {
            overlapMode = OVERLAP_END;
            numOverlaps++;
          }
          else if (overlap > 0 && overlap <= indelOffset) {
            overlapMode = OVERLAP_SPLICE;
            numOverlaps++;
          }
        }
        if (overlapMode == OVERLAP_NONE) {
          continue;
        }
        // A new alteration slot is appended here; exactly one branch below fills it.
        currAlteration = arrayp (alterations,arrayMax (alterations),Alteration);
        if (numOverlaps > 1) {
          util_addAlteration (currAlteration,currInterval->name,"multiExonHit",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_SPLICE) {
          util_addAlteration (currAlteration,currInterval->name,"spliceOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_START) {
          util_addAlteration (currAlteration,currInterval->name,"startOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_END) {
          util_addAlteration (currAlteration,currInterval->name,"endOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength > refLength) {
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"insertionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"insertionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength < refLength) {
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"deletionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"deletionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength == refLength) {
          util_addAlteration (currAlteration,currInterval->name,"substitution",currInterval,position,0);
        }
        else {
          die ("Unexpected type: %d %s %s %s",
               currVcfEntry->position,currVcfEntry->chromosome,
               currVcfEntry->referenceAllele,currVcfEntry->alternateAllele);
        }
        // Frameshift indels get no protein-level comparison.
        if ((sizeIndel % 3) != 0 && altLength != refLength) { 
          continue;
        }
        // Only run the remaining block of code if the indel is fully contained (insertion or deletion) AND does not cause a frameshift OR
        // if it is a substitution that is fully contained in the coding sequence
        // Build the lookup key "name|chromosome|strand|start|end|..." and find
        // the matching sequence in seqs.
        stringPrintf (buffer,"%s|%s|%c|",currInterval->name,currInterval->chromosome,currInterval->strand);
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          stringAppendf (buffer,"%d|%d%s",currSubInterval->start,currSubInterval->end,j < arrayMax (currInterval->subIntervals) - 1 ? "|" : "");
        }
        testSeq.name = hlr_strdup (string (buffer));
        if (!arrayFind (seqs,&testSeq,&index,(ARRAYORDERF)util_sortSequencesByName)) {
          die ("Expected to find %s in seqs",string (buffer));
        }
        hlr_free (testSeq.name);
        currSeq = arrp (seqs,index,Seq);
        strReplace (&sequenceBeforeIndel,currSeq->sequence);
        seqLength = strlen (sequenceBeforeIndel); 
        coordinates = util_getCoordinates (currInterval);
        // arraySort (coordinates,(ARRAYORDERF)util_sortCoordinatesByChromosomeAndTranscriptPosition); Array is already sorted by definition
        // Rebuild the sequence in buffer with the variant applied at the
        // index whose genomic coordinate equals position.
        j = 0;
        stringClear (buffer);
        while (j < seqLength) {
          if (util_getGenomicCoordinate (coordinates,j,currVcfEntry->chromosome) == position) {
            if (altLength > refLength) {
              // Insertion: the whole alternate allele replaces one base.
              stringCat (buffer,textItem (alternateAlleles,h));
              j++;
              continue;
            }
            else if (altLength < refLength) {
              // Deletion: keep this base, then skip refLength - altLength + 1 positions.
              stringCatChar (buffer,sequenceBeforeIndel[j]);
              j = j + refLength - altLength + 1;
              continue;
            }
            else {
              // Substitution: the alternate allele replaces altLength bases.
              stringCat (buffer,textItem (alternateAlleles,h));
              j = j + altLength;
              continue;
            }
          }
          stringCatChar (buffer,sequenceBeforeIndel[j]);
          j++;
        }
        util_destroyCoordinates (coordinates);
        // Translate the original and the edited sequence and record the difference.
        proteinSequenceBeforeIndel = hlr_strdup (util_translate (currInterval,sequenceBeforeIndel));
        proteinSequenceAfterIndel = hlr_strdup (util_translate (currInterval,string (buffer)));
        addSubstitution (currAlteration,proteinSequenceBeforeIndel,proteinSequenceAfterIndel,indelOffset);
        hlr_free (proteinSequenceBeforeIndel);
        hlr_free (proteinSequenceAfterIndel);
      }
      if (arrayMax (alterations) == 0) {
        continue;
      }
      arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType);
      stringClear (buffer);
      // Walk the sorted alterations; each run with equal geneId and type
      // becomes one ','-separated group in buffer.
      i = 0;
      while (i < arrayMax (alterations)) {
        currAlteration = arrp (alterations,i,Alteration);
        stringAppendf (buffer,"%s%d:%s:%s:%c:%s",stringLen (buffer) == 0 ? "" : ",",h + 1,currAlteration->geneName,currAlteration->geneId,currAlteration->strand,currAlteration->type);
         stringClear (disabledTranscripts);
        // Transcript detail depends on the alteration: with a substitution
        // string, without a relative position for the overlap types, or
        // with the relative position only.
        if (currAlteration->substitution[0] != '\0') {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d_%s",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition,currAlteration->substitution);
        }
        else if (strEqual (currAlteration->type,"multiExonHit") || strEqual (currAlteration->type,"spliceOverlap") ||
                 strEqual (currAlteration->type,"startOverlap") || strEqual (currAlteration->type,"endOverlap")) {
          stringAppendf (disabledTranscripts,"%s:%s:%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength);
        }
        else {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition);
        }
        numDisabledTranscripts = 1;
        j = i + 1;
        while (j < arrayMax (alterations)) {
          nextAlteration = arrp (alterations,j,Alteration);
          if (strEqual (currAlteration->geneId,nextAlteration->geneId) && 
              strEqual (currAlteration->type,nextAlteration->type)) {
            if (nextAlteration->substitution[0] != '\0') {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d_%s",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition,nextAlteration->substitution);
            }
            else if (strEqual (nextAlteration->type,"multiExonHit") || strEqual (nextAlteration->type,"spliceOverlap") ||
                     strEqual (nextAlteration->type,"startOverlap") || strEqual (nextAlteration->type,"endOverlap")) {
              stringAppendf (disabledTranscripts,":%s:%s:%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength);
            }
            else {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition);
            }
            numDisabledTranscripts++;
          }
          else {
            break;
          }
          j++;
        }
        i = j;
        // Report "<transcripts in this group>/<transcript ids found for the gene>".
        geneTranscriptIds = util_getTranscriptIdsForGeneId (geneTranscriptEntries,currAlteration->geneId);
        stringAppendf (buffer,":%d/%d:%s",numDisabledTranscripts,arrayMax (geneTranscriptIds),string (disabledTranscripts));
      }
      // Print the fixed columns once, before the first allele annotation.
      if (flag1 == 0) {
        printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=",
                currVcfEntry->chromosome,currVcfEntry->position,currVcfEntry->id,
                currVcfEntry->referenceAllele,currVcfEntry->alternateAllele,
                currVcfEntry->quality,currVcfEntry->filter,currVcfEntry->info);
        flag1 = 1;
      }
      printf ("%s%s",flag2 == 1 ? "," : "",string (buffer)); 
      flag2 = 1;
    }
    // Only entries that received an annotation get their genotype columns
    // and the terminating newline.
    if (flag1 == 1) {
      for (i = 0; i < arrayMax (currVcfEntry->genotypes); i++) {
        currVcfGenotype = arrp (currVcfEntry->genotypes,i,VcfGenotype);
        if (i == 0) {
          printf ("\t%s\t",currVcfEntry->genotypeFormat);
        }
        printf ("%s%s%s%s",currVcfGenotype->genotype,
                currVcfGenotype->details[0] != '\0' ? ":" : "",
                currVcfGenotype->details[0] != '\0' ?  currVcfGenotype->details : "",
                i < arrayMax (currVcfEntry->genotypes) - 1 ? "\t" : ""); 
      }
      puts ("");
    }
  }
  vcf_deInit ();
  return 0;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);

  if (argc != 2) {
    usage ("%s <blackList.txt>",argv[0]);
  }  
  fp = fopen( argv[1], "r" );
  
  if( !fp )  die("Unable to open file: %s", argv[1]);
  // reading blacklist file
  LineStream ls = ls_createFromFile( argv[1] );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  fclose(fp);
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], argv[1]);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  return 0;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);
  config *Conf;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH"));
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "ANNOTATION_DIR")==NULL ) {
    die("%s:\tCannot find ANNOTATION_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "BLACKLIST_FILENAME")==NULL ) {
    die("%s:\tCannot find BLACKLIST_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  Stringa buffer=stringCreate( 100 );
  stringPrintf( buffer, "%s/%s", confp_get( Conf, "ANNOTATION_DIR"), confp_get( Conf, "BLACKLIST_FILENAME") );
  /*  fp = fopen( string( buffer ), "r" );
  if( !fp )  die("Unable to open file: %s", string(buffer));
  stringDestroy( buffer );
  */ 
// reading blacklist file
  LineStream ls = ls_createFromFile( string(buffer) );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  //fclose(fp);
  ls_destroy( ls );
  stringDestroy( buffer );
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    if( currGE->geneSymbolTranscript1 == NULL ) {
      die("Gene symbols are not present in the GFR file. Please run gfrAddInfo before gfrBlackListFilter.");
      return EXIT_FAILURE;
    }
	
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    if( strEqual( currQuery.gene1 , currQuery.gene2 ) ) {
	countRemoved++;
	continue;
      }
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], confp_get( Conf, "BLACKLIST_FILENAME"));
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  confp_close( Conf);
  return 0;
}
Exemple #14
0
int main(int argc, char *argv[])
{
	Array breakPoints;
	BreakPoint *currBP;
	BreakPointRead *currBPR;
	int minNumReads, minNumUniqueOffsets,
	    minNumReadsForKS,numPossibleOffsets;
	double pValueCutoffForKS;
	Array offsets;
	Array randomNumbers;
	double *observedOffsets;
	double *randomOffsets;

 	if (argc != 6) {
		usage((char*) "%s <minNumReads> <minNumUniqueOffsets> "
              "<minNumReadsForKS> <pValueCutoffForKS> <numPossibleOffsets>", 
              argv[0]);
    }
	
	minNumReads         = std::atoi(argv[1]);
	minNumUniqueOffsets = std::atoi(argv[2]);
	minNumReadsForKS    = std::atoi(argv[3]);
	pValueCutoffForKS   = std::atof(argv[4]);
	numPossibleOffsets  = std::atoi(argv[5]);
	bp_init("-");
	offsets = arrayCreate(100, int);
	randomNumbers = arrayCreate(100, int);
	breakPoints = bp_getBreakPoints();

	for (int i = 0; i < arrayMax(breakPoints); i++) {
		currBP = arrp(breakPoints, i, BreakPoint);
		arrayClear(offsets);
		for (int j = 0; j < arrayMax(currBP->breakPointReads); j++) {
			currBPR = arrp(currBP->breakPointReads, j, BreakPointRead);
			array(offsets, arrayMax(offsets), int) = currBPR->offset;
		}
		arraySort(offsets, (ARRAYORDERF) arrayIntcmp);
		arrayUniq(offsets, NULL, (ARRAYORDERF) arrayIntcmp);
		if (arrayMax(currBP->breakPointReads) >= minNumReads && 
		    arrayMax(currBP->breakPointReads) < minNumReadsForKS) {        
			if (arrayMax(offsets) >= minNumUniqueOffsets)
				std::puts(bp_writeBreakPoint(currBP));
		}
		else if (arrayMax(currBP->breakPointReads) >= minNumReads && 
			 arrayMax(currBP->breakPointReads) >= minNumReadsForKS) {
			arrayClear(randomNumbers);
			for (int j = 0; j < arrayMax(offsets); j++)
				array(randomNumbers, arrayMax(randomNumbers), int) = std::rand() % numPossibleOffsets;
			
			arraySort(randomNumbers, (ARRAYORDERF) arrayIntcmp);
			observedOffsets = (double *) hlr_malloc(arrayMax(offsets) * sizeof(double)); 
			randomOffsets = (double *) hlr_malloc(arrayMax(offsets) * sizeof(double)); 
			for (int j = 0; j < arrayMax(offsets); j++) {
				observedOffsets[j] = arru(offsets, j, int);
				randomOffsets[j] = arru(randomNumbers, j, int);
			}
			if (pValueCutoffForKS < TMath::KolmogorovTest(arrayMax(offsets), 
								      observedOffsets, 
								      arrayMax(offsets), 
								      randomOffsets, 
								      ""))
				std::puts(bp_writeBreakPoint(currBP));
			
			hlr_free(observedOffsets);
			hlr_free(randomOffsets);
		}
	}