Ejemplo n.º 1
0
/**
 * Returns a pointer to next ElandQuery. 
 * @pre The module has been initialized using elandParser_init().
 * Parse entries of the following format:
   \verbatim
   >FC30H5TAA_100308:2:1:1647:1161	GCTTACATTTTTCCTCTCTACATTATC	U0	1	0	0	chr17.fa	8466296	F	..
   >FC30H5TAA_100308:2:1:1588:122	GAGTTAGCCTTGGGACCCCTACTTCTT	U0	1	0	0	chr3.fa	61525628	F	..
   >FC30H5TAA_100308:2:1:1642:123	GGTGAGAGCCGCGACGGGCTTTAGGCG	NM	0	0	0
   >FC30H5TAA_100308:2:1:1630:119	CCGCCATTGCCAGCCCCCAGCTGACGG	R2	0	0	2
   >FC30H5TAA_100308:2:1:1603:120	GCAAGATGAAGTGAAAGGTAAAGAATC	U1	0	1	1	chrM.fa	15277	R	..	26A
   \endverbatim
 */
ElandQuery* elandParser_nextQuery (void)
{
  WordIter w;
  char *line,*token,*pos;
  static ElandQuery *currElandQuery = NULL;
   
  while (line = ls_nextLine (ls)) {
    if (line[0] == '\0') {
      continue;
    }
    elandParser_freeQuery (currElandQuery);
    currElandQuery = NULL;
    AllocVar (currElandQuery);
    w = wordIterCreate (line,"\t",0);
    currElandQuery->sequenceName = hlr_strdup (wordNext (w) + 1); // remove the '>' character at beginning of the line
    currElandQuery->sequence = hlr_strdup (wordNext (w));
    currElandQuery->matchCode = hlr_strdup (wordNext (w));
    if (strEqual (currElandQuery->matchCode,"QC")) {
      wordIterDestroy (w);
      return currElandQuery;
    }
    currElandQuery->exactMatches = atoi (wordNext (w));
    currElandQuery->oneErrorMatches = atoi (wordNext (w));
    currElandQuery->twoErrorMatches = atoi (wordNext (w));
    token = wordNext (w);
    if (token == NULL) {
      wordIterDestroy (w);
      return currElandQuery;
    }
    if (!(pos = strchr (token,'.'))) {
      die ("Expected '.' in chromosome name: %s",token);
    }
    *pos = '\0';
    currElandQuery->chromosome = hlr_strdup (pos + 1);
    currElandQuery->position = atoi (wordNext (w));
    token = wordNext (w);
    if (token[0] == 'F') {
      currElandQuery->strand = '+'; 
    }
    else if (token[0] == 'R') {
      currElandQuery->strand = '-'; 
    } 
    wordIterDestroy (w);
    return currElandQuery;
  }
  elandParser_freeQuery (currElandQuery);
  currElandQuery = NULL;
  return currElandQuery;
}
Ejemplo n.º 2
0
/**
 * Describe the amino-acid change caused by an indel and store a copy of the
 * description in currAlteration->substitution.
 *
 * The residue index is derived from currAlteration->relativePosition
 * (three nucleotides per residue). Three layouts are produced:
 *  - protein got longer:  "<index+1>_<old residue>-><new residues>"
 *  - protein got shorter: "<index+1>_<old residues>-><new residue>"
 *  - same length:         "<index>_<old substring>-><new substring>", where
 *    both substrings are taken with subString() from index - 1 up to
 *    index + ceil (indelOffset / 3).
 */
static void addSubstitution (Alteration *currAlteration, char* proteinSequenceBeforeIndel, char *proteinSequenceAfterIndel, int indelOffset) 
{
  static Stringa text = NULL;
  int residue;
  int sizeBefore,sizeAfter;
  int residuesToList;
  int k;

  stringCreateClear (text,100);
  residue = (currAlteration->relativePosition - 1) / 3;
  sizeBefore = strlen (proteinSequenceBeforeIndel);
  sizeAfter = strlen (proteinSequenceAfterIndel);
  // one residue more than the length difference is listed on the longer side
  residuesToList = abs (sizeBefore - sizeAfter) + 1;

  if (sizeBefore == sizeAfter) {
    int lastResidue = residue + (int)ceil ((double)indelOffset / 3);

    stringPrintf (text,"%d_%s->",residue,subString (proteinSequenceBeforeIndel,residue - 1,lastResidue));
    stringAppendf (text,"%s",subString (proteinSequenceAfterIndel,residue - 1,lastResidue));
  }
  else if (sizeBefore < sizeAfter) {
    stringPrintf (text,"%d_%c->",residue + 1,proteinSequenceBeforeIndel[residue]);
    for (k = 0; k < residuesToList; k++) {
      stringAppendf (text,"%c",proteinSequenceAfterIndel[residue + k]);
    }
  }
  else {
    stringPrintf (text,"%d_",residue + 1);
    for (k = 0; k < residuesToList; k++) {
      stringAppendf (text,"%c",proteinSequenceBeforeIndel[residue + k]);
    }
    stringAppendf (text,"->%c",proteinSequenceAfterIndel[residue]);
  }
  currAlteration->substitution = hlr_strdup (string (text));
}
Ejemplo n.º 3
0
/**
 * Sum the values processEntry() returns for every MRF entry read from
 * standard input and print the total for the requested region.
 *
 * Usage: <program> <targetName:targetStart-targetEnd>
 *
 * The region argument is split on ':', '-' and ' '. An argument that does not
 * yield all three parts is rejected with the usage message instead of passing
 * NULL to hlr_strdup()/atoi().
 */
int main (int argc, char *argv[])
{
  MrfEntry *currEntry;
  char *targetName;
  char *token;
  int targetStart,targetEnd;
  WordIter w;
  int count = 0;

  if (argc != 2) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }

  // usage() is expected not to return; the argc check above already relies on that
  w = wordIterCreate (argv[1],":- ",0);
  token = wordNext (w);
  if (token == NULL) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  targetName = hlr_strdup (token);
  token = wordNext (w);
  if (token == NULL) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  targetStart = atoi (token);
  token = wordNext (w);
  if (token == NULL) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  targetEnd = atoi (token);
  wordIterDestroy (w);

  mrf_init ("-");
  while ((currEntry = mrf_nextEntry ()) != NULL) {
    count += processEntry (currEntry,targetName,targetStart,targetEnd);
  }
  printf("Count for %s:%d-%d = %d\n", targetName, targetStart, targetEnd, count);
  mrf_deInit ();
  hlr_free (targetName);
  return 0;
}
Ejemplo n.º 4
0
/**
 * Write the MRF header, then hand every MRF entry read from standard input to
 * processEntry() together with the requested region.
 *
 * Usage: <program> <targetName:targetStart-targetEnd>
 *
 * The region argument is split on ':', '-' and ' '. An argument that does not
 * yield all three parts is rejected with the usage message instead of passing
 * NULL to hlr_strdup()/atoi().
 */
int main (int argc, char *argv[])
{
  MrfEntry *currEntry;
  char *targetName;
  char *token;
  int targetStart,targetEnd;
  WordIter w;

  if (argc != 2) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }

  // usage() is expected not to return; the argc check above already relies on that
  w = wordIterCreate (argv[1],":- ",0);
  token = wordNext (w);
  if (token == NULL) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  targetName = hlr_strdup (token);
  token = wordNext (w);
  if (token == NULL) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  targetStart = atoi (token);
  token = wordNext (w);
  if (token == NULL) {
    usage ("%s <targetName:targetStart-targetEnd>",argv[0]);
  }
  targetEnd = atoi (token);
  wordIterDestroy (w);

  mrf_init ("-");
  puts (mrf_writeHeader ());
  while ((currEntry = mrf_nextEntry ()) != NULL) {
    processEntry (currEntry,targetName,targetStart,targetEnd);
  }
  mrf_deInit ();
  hlr_free (targetName);
  return 0;
}
Ejemplo n.º 5
0
Archivo: cgi.c Proyecto: Accio/ribios
/**
 * Look up one CGI parameter by name.
 *
 * @param r_param     name of the parameter; R_NilValue yields R_NilValue
 * @param ignore_case when true, names are compared with myStrCaseEqual,
 *                    otherwise with myStrEqual
 * @param r_default   value returned when no parameter of that name is found
 * @return an R character vector of length one holding the value of the first
 *         matching parameter, or r_default
 */
SEXP r_cgiParam(SEXP r_param, SEXP ignore_case, SEXP r_default) {
  if(r_param == R_NilValue) return(R_NilValue);

  char* name;
  Stringa value=stringCreate(16);
  char *param=cStr(r_param);
  SEXP res=r_default;
  int (*fPtr)(char*, char*);
  fPtr=cBool(ignore_case) ? &myStrCaseEqual : &myStrEqual;

  cgiGetInit();

  while((name = cgiGetNext(value)) != NULL) {
    if((*fPtr)(name, param)) {
      // mkString copies the text into an R object, so no intermediate C copy
      // is needed (the previous hlr_strdup copy was never released)
      res=mkString(string(value));
      break;
    }
  }

  stringDestroy(value);

  return res;
}
Ejemplo n.º 6
0
/**
 * Read the next four-line FASTQ record (header, sequence, '+' line, quality)
 * from lsFastq.
 *
 * @param freeMemory   when non-zero, the record returned by the previous call
 *                     is released before a new one is built (and at the end
 *                     of the input)
 * @param truncateName when non-zero, only the first word of the header line
 *                     (after leading spaces) is kept as sequence name
 * @return the record just read, or NULL when no further record is available.
 *         The record is kept in a function-local static.
 *
 * A record that stops before its quality line is reported with die().
 */
static Fastq* fastq_processNextSequence (int freeMemory, int truncateName)
{
  char *line;
  static Fastq* currFQ = NULL;
  int count;
  Seq* currSeq = NULL;

  if (ls_isEof (lsFastq)) {
    if (freeMemory) {
      fastq_freeFastq (currFQ);
      currFQ = NULL; // do not keep a pointer to released memory for later calls
    }
    return NULL;
  }
  count = 0;
  while ( (line=ls_nextLine (lsFastq)) && (count<4) ) {
    if (line[0] == '\0') {
      continue;
    }
    if (line[0] == '@') {
      if (freeMemory) {
        fastq_freeFastq (currFQ);
      }
      count++;
      AllocVar (currFQ);
      AllocVar (currFQ->seq);
      currSeq = currFQ->seq;
      if (truncateName) {
        // Trim inside the line buffer and copy the result, so that name always
        // points to the start of its own allocation (the helpers may return a
        // pointer behind leading spaces).
        currSeq->name = hlr_strdup (firstWordInLine (skipLeadingSpaces (line + 1)));
      }
      else {
        currSeq->name = hlr_strdup (line + 1);
      }
      line = ls_nextLine (lsFastq); // reading sequence
      if (line == NULL) {
        die ("Truncated FASTQ record: missing sequence for '%s'", currSeq->name);
      }
      currSeq->sequence = hlr_strdup ( line );
      currSeq->size = strlen (currSeq->sequence);
      count++;
      line = ls_nextLine (lsFastq); // reading quality ID
      if (line == NULL) {
        die ("Truncated FASTQ record: missing quality ID for '%s'", currSeq->name);
      }
      if( line[0] != '+' )
        die("Expected quality ID: '+' or '+%s'", currSeq->name );
      count++;
      line = ls_nextLine (lsFastq); // reading quality
      if (line == NULL) {
        die ("Truncated FASTQ record: missing quality for '%s'", currSeq->name);
      }
      currFQ->quality = hlr_strdup( line );
      count++;
    }
  }
  // the loop condition has already fetched one line beyond the record
  ls_back (lsFastq,1);
  if (count == 0) {
    // only blank or non-header lines were left: there is no new record, and
    // the one from the previous call must not be handed out a second time
    if (freeMemory) {
      fastq_freeFastq (currFQ);
      currFQ = NULL;
    }
    return NULL;
  }
  return currFQ;
}
Ejemplo n.º 7
0
/**
 * Append one GFF "exon" entry to gffEntries for every block of currRead.
 * Each entry stores a copy of the block's target name and a copy of the
 * formatted GFF line; the last column of the line is "TG<groupNumber>".
 * Score, strand and frame columns are written as '.'.
 */
static void createGffEntry( Array gffEntries, MrfRead *currRead, int groupNumber ) {
  static Stringa gffLine = NULL;
  int blockIndex = 0;

  stringCreateClear (gffLine,100);
  while (blockIndex < arrayMax (currRead->blocks)) {
    MrfBlock *block = arrp (currRead->blocks,blockIndex,MrfBlock);
    GffEntry *entry = arrayp (gffEntries,arrayMax (gffEntries),GffEntry);

    stringPrintf (gffLine,"%s\tMRF\texon\t%d\t%d\t.\t.\t.\tTG%d",
                  block->targetName,
                  block->targetStart,
                  block->targetEnd,
                  groupNumber);
    entry->targetName = hlr_strdup (block->targetName);
    entry->line = hlr_strdup (string (gffLine));
    blockIndex++;
  }
}
/**
 * Look up the TreeFam id recorded for a transcript.
 * kgTreeFams is searched with arrayFind() using
 * sortKgTreeFamsByTranscriptName as ordering function.
 * @return the treeFamId of the matching element (not a copy), or NULL when
 *         the transcript is not found
 */
static char* lookUpTreeFam (Array kgTreeFams, char *transcript) 
{
  KgTreeFam probe;
  int position;
  int found;

  // arrayFind() needs an element to compare against, so wrap the name in one
  probe.transcriptName = hlr_strdup (transcript);
  found = arrayFind (kgTreeFams,&probe,&position,(ARRAYORDERF)sortKgTreeFamsByTranscriptName);
  hlr_free (probe.transcriptName);
  if (!found) {
    return NULL;
  }
  return arrp (kgTreeFams,position,KgTreeFam)->treeFamId;
}
Ejemplo n.º 9
0
/**
 * Get the next BlastQuery.
 * @pre The module has been initialized using blastParser_init().
 */
BlastQuery* blastParser_nextQuery (void)
{
    char *line,*pos;
    static char *queryName = NULL;
    static char *prevBlastQueryName = NULL;
    static BlastQuery *currBlastQuery = NULL;
    int first;

    if (!ls_isEof (ls)) {
        blastParser_freeQuery (currBlastQuery);
        currBlastQuery = NULL;
        AllocVar (currBlastQuery);
        currBlastQuery->entries = arrayCreate (5,BlastEntry);
        first = 1;
        while (line = ls_nextLine (ls)) {
            if (line[0] == '\0') {
                continue;
            }
            pos = strchr (line,'\t');
            *pos = '\0';
            strReplace (&queryName,line);
            if (first == 1 || strEqual (prevBlastQueryName,queryName)) {
                blastParser_processLine (pos + 1,currBlastQuery);
            }
            else {
                ls_back (ls,1);
                return currBlastQuery;
            }
            if (first == 1) {
                currBlastQuery->qName = hlr_strdup (queryName);
                first = 0;
            }
            strReplace(&prevBlastQueryName,queryName);
        }
        if (first == 1) {
            return NULL;
        }
        else {
            return currBlastQuery;
        }
    }
    blastParser_freeQuery (currBlastQuery);
    currBlastQuery = NULL;
    return NULL;
}
Ejemplo n.º 10
0
static void blastParser_processLine (char* line, BlastQuery* currBlastQuery)
{
    WordIter w;
    BlastEntry *currEntry;

    currEntry = arrayp (currBlastQuery->entries,arrayMax (currBlastQuery->entries),BlastEntry);
    w = wordIterCreate (line,"\t",0);
    currEntry->tName = hlr_strdup (wordNext (w));
    currEntry->percentIdentity = atof (wordNext (w));
    currEntry->alignmentLength = atoi (wordNext (w));
    currEntry->misMatches = atoi (wordNext (w));
    currEntry->gapOpenings = atoi (wordNext (w));
    currEntry->qStart = atoi (wordNext (w));
    currEntry->qEnd = atoi (wordNext (w));
    currEntry->tStart = atoi (wordNext (w));
    currEntry->tEnd = atoi (wordNext (w));
    currEntry->evalue = atof (wordNext (w));
    currEntry->bitScore = atof (wordNext (w));
    wordIterDestroy (w);
}
Ejemplo n.º 11
0
/**
 * Scan wigs for runs of values at or above threshold and append one Tar to
 * tars for every run that is long enough.
 *
 * A run starts at the first element whose value is not below threshold and
 * is extended until maxGap consecutive elements below threshold have been
 * seen (or the array ends). Scanning resumes at the element where the run
 * was abandoned.
 *
 * @param tars       receives the segments; each gets a copy of targetName
 * @param wigs       input signal, element type Wig
 * @param targetName name stored in every appended Tar
 * @param threshold  minimum value for an element to count as "on"
 * @param maxGap     number of consecutive "off" elements that ends a run
 * @param minRun     minimum extent a run needs in order to be reported
 *
 * NOTE(review): the run end is tracked as an array index, yet it is compared
 * with and stored next to Wig.position values. That is only consistent if
 * index and position coincide for the caller's data - confirm.
 */
void performSegmentation (Array tars, Array wigs, char* targetName, double threshold, int maxGap, int minRun)
{
  int runStart = 0;

  while (runStart < arrayMax (wigs)) {
    Wig *firstWig = arrp (wigs,runStart,Wig);
    int scan;
    int lastAbove;
    int belowStreak;

    if (firstWig->value < threshold) {
      runStart++;
      continue;
    }
    lastAbove = runStart + 1;
    belowStreak = 0;
    for (scan = runStart + 1; scan < arrayMax (wigs); scan++) {
      Wig *candidate = arrp (wigs,scan,Wig);

      if (!(candidate->value < threshold)) {
        belowStreak = 0;
        lastAbove = scan;
        continue;
      }
      belowStreak++;
      if (belowStreak >= maxGap) {
        break;
      }
    }
    if ((lastAbove - 1 - firstWig->position + 1) >= minRun) {
      Tar *segment = arrayp (tars,arrayMax (tars),Tar);

      segment->start = firstWig->position;
      segment->end = lastAbove + 1;
      segment->targetName = hlr_strdup (targetName);
    }
    runStart = scan;
  }
}
Ejemplo n.º 12
0
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);
  config *Conf;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH"));
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "ANNOTATION_DIR")==NULL ) {
    die("%s:\tCannot find ANNOTATION_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "BLACKLIST_FILENAME")==NULL ) {
    die("%s:\tCannot find BLACKLIST_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  Stringa buffer=stringCreate( 100 );
  stringPrintf( buffer, "%s/%s", confp_get( Conf, "ANNOTATION_DIR"), confp_get( Conf, "BLACKLIST_FILENAME") );
  /*  fp = fopen( string( buffer ), "r" );
  if( !fp )  die("Unable to open file: %s", string(buffer));
  stringDestroy( buffer );
  */ 
// reading blacklist file
  LineStream ls = ls_createFromFile( string(buffer) );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  //fclose(fp);
  ls_destroy( ls );
  stringDestroy( buffer );
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    if( currGE->geneSymbolTranscript1 == NULL ) {
      die("Gene symbols are not present in the GFR file. Please run gfrAddInfo before gfrBlackListFilter.");
      return EXIT_FAILURE;
    }
	
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    if( strEqual( currQuery.gene1 , currQuery.gene2 ) ) {
	countRemoved++;
	continue;
      }
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], confp_get( Conf, "BLACKLIST_FILENAME"));
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  confp_close( Conf);
  return 0;
}
Ejemplo n.º 13
0
/**
 * Annotate tab-separated lines read from standard input with gene symbol and
 * description.
 *
 * The first column of every line is a transcript name, optionally followed
 * by "_<exonID>". The name (without the exon suffix) is looked up in the
 * known-gene cross references (ANNOTATION_DIR/KNOWN_GENE_XREF_FILENAME).
 * Output per line:
 *   with exon suffix:    name_exonID <tab> symbol <tab> exonID <tab> description
 *   without exon suffix: name <tab> symbol <tab> 1 <tab> description
 * followed by the remaining columns of the input line.
 */
int main (int argc, char *argv[])
{
	Array kgXrefs;
	Stringa buffer;
	LineStream ls;
	int count=0;
	char* geneSymbolTranscript;
	char* descriptionTranscript;
	char* line;
	char* annotationDir;
	char* xrefFileName;

	config *conf;

	if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
		return EXIT_FAILURE;

	annotationDir = confp_get(conf, "ANNOTATION_DIR");
	xrefFileName = confp_get(conf, "KNOWN_GENE_XREF_FILENAME");
	if (annotationDir == NULL || xrefFileName == NULL) {
		warn ("%s: ANNOTATION_DIR or KNOWN_GENE_XREF_FILENAME missing in the configuration file",argv[0]);
		confp_close(conf);
		return EXIT_FAILURE;
	}

	buffer = stringCreate (100);
	stringPrintf (buffer,"%s/%s",annotationDir,xrefFileName);
	kgXrefs = util_readKnownGeneXrefs (string (buffer));
	arraySort (kgXrefs,(ARRAYORDERF)sortKgXrefsByTranscriptName);
	stringDestroy (buffer);

	ls = ls_createFromFile("-");

	while ((line = ls_nextLine(ls)) != NULL) {
		// the word iterator cuts the line into pieces, so keep an intact copy
		// for printing the remaining columns
		char *lineP = hlr_strdup(line);
		WordIter w = wordIterCreate( line, "\t", 0);
		char *nameTranscript = wordNext( w );
		char *exonID = NULL;
		char *p;
		size_t nameLength;

		if (nameTranscript == NULL) { // no first column to annotate
			hlr_free(lineP);
			wordIterDestroy(w);
			continue;
		}
		// Length of the complete first column, taken BEFORE the exon suffix is
		// cut off: it is the offset of the remaining columns in lineP.
		nameLength = strlen(nameTranscript);
		p = strrchr(nameTranscript, '_');
		if (p) {
			exonID = hlr_strdup( p+1 );
			*p='\0';
		}
		transcript2geneSymbolAndGeneDescription(kgXrefs,
							nameTranscript,
							&geneSymbolTranscript,
							&descriptionTranscript);
		if (exonID) {
			printf("%s_%s\t%s\t%s\t%s",
				nameTranscript,
				exonID,
				geneSymbolTranscript,
				exonID,
				descriptionTranscript);
			hlr_free(exonID);
		} else {
			printf("%s\t%s\t1\t%s",
				nameTranscript,
				geneSymbolTranscript,
				descriptionTranscript);
		}
		printf("%s\n", lineP+nameLength);
		count++;
		hlr_free(lineP);
		wordIterDestroy(w);
	}
	ls_destroy (ls);
	warn ("%s_numGfrEntries: %d",argv[0],count);
	confp_close(conf);

	return EXIT_SUCCESS;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  int i,j,k,l, h,index;
  Stringa buffer,cmd,fnSequencesToAlign;
  FILE *fp;
  FILE *fp1;
  FILE *fp2;
  FILE *freads1;
  FILE *freads2;
  Array gfrEntries;
  BowtieQuery *currBQ,testBQ;
  BowtieEntry *currBE;
  Texta seqNames;
  int readSize1, readSize2, minReadSize;
  Array bowtieQueries;
  char transcriptNumber;
  int isHomologous,homologousCount;
  int count;
  int countRemoved;
  unsigned short int tooMany;
  BlatQuery *blQ;

  config *conf;

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc", argv[0]);
    return EXIT_FAILURE;
  } 
  if ( (confp_get( conf, "BLAT_TWO_BIT_TO_FA")) == NULL) {
    die("%s:\tCannot find BLAT_TWO_BIT_TO_FA in the configuration file: %s", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  } 
  if ( (confp_get( conf,"BLAT_DATA_DIR")) == NULL) {
    die("%s:\tCannot find BLAT_DATA_DIR in the configuration file: %sc", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  } 
 if( confp_get( conf, "TMP_DIR")==NULL ) {
    die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf, "BLAT_GFSERVER")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) {
    die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "PSEUDOGENE_DIR")==NULL ) {
   die("%s:\tCannot find PSEUDOGENE_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
   return EXIT_FAILURE;
 }
 if( confp_get( conf, "PSEUDOGENE_FILENAME")==NULL ) {
   die("%s:\tCannot find PSEUDOGENE_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
   return EXIT_FAILURE;
 }
 
  cmd = stringCreate (100);
  // initializing the gfServers
  stringPrintf( cmd, "%s status %s %s &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT") );
  int ret = hlr_system( string(cmd), 1 );
  if( ret != 0 ) { // not initialized
    stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_genome.log start %s %s %s/%s  &", confp_get( conf, "BLAT_GFSERVER"), confp_get(conf, "TMP_DIR"),confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"));
    hlr_system( string( cmd ), 0 );
    long int startTime = time(0);
    stringPrintf( cmd , "%s status %s %s &2> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"));
    while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ;
    if( hlr_system( string(cmd), 1 ) != 0 )  {
      die("gfServer for %s/%s not initialized: %s %s %s", confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); 
      return EXIT_FAILURE;
    }
  } 
  // end initialization

  
  gfr_init ("-");
  gfrEntries =  gfr_parse ();
  if (arrayMax (gfrEntries) == 0){
    puts (gfr_writeHeader ());
    gfr_deInit ();
    return 0;
  }
  seqNames = textCreate (10000); 
  buffer = stringCreate (100);
  fnSequencesToAlign = stringCreate (100);
  count = 0;
  countRemoved = 0;

  stringPrintf( buffer, "%s/%s", confp_get( conf, "PSEUDOGENE_DIR"), confp_get( conf, "PSEUDOGENE_FILENAME") );
  intervalFind_addIntervalsToSearchSpace (string(buffer),0);

  puts (gfr_writeHeader ());
 
  for (i = 0; i < arrayMax (gfrEntries); i++) {
    currGE = arrp (gfrEntries,i,GfrEntry);
    homologousCount = 0;
    minReadSize=10000;
    // creating two fasta files with the two genes
    
    stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript1.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA") , confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript1, currGE->startTranscript1, currGE->endTranscript1, confp_get(conf, "TMP_DIR"), currGE->id);
    hlr_system( string(cmd) , 0);   
    stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript2.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA"),  confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript2, currGE->startTranscript2, currGE->endTranscript2, confp_get(conf, "TMP_DIR"), currGE->id);
    hlr_system( string(cmd) , 0);   
    
    Stringa fa1 = stringCreate( 100 ); 
    Stringa fa2 = stringCreate( 100 );
    
    // creating the two fasta files with the reads
    stringPrintf( fa1, "%s/%s_reads1.fa", confp_get(conf, "TMP_DIR"), currGE->id);
    if (!(freads1 = fopen ( string(fa1) ,"w"))) {
      die ("Unable to open file: %s",string (fa1));
    }   
    // writing the reads of the first end into file
    
    for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) {
      char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l)); // read1
      readSize1 = strlen( currRead1 );
      if( readSize1 == 0 ) die("Read size cannot be zero: read1[ %s ]", currRead1);
      if( readSize1 < minReadSize ) minReadSize = readSize1;
      fprintf( freads1, ">%d\n%s\n", l, currRead1 );
      hlr_free( currRead1 );
    }
    fclose( freads1 );
    
    stringPrintf( fa2, "%s/%s_reads2.fa", confp_get(conf, "TMP_DIR"), currGE->id);
    if (!(freads2 = fopen ( string(fa2) ,"w"))) {
      die ("Unable to open file: %s",string (fa2));
    } 
    // writing the reads of the second end into file
    for (l = 0; l < arrayMax (currGE->readsTranscript2); l++) {
      char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l)); // read2
      readSize2 = strlen( currRead2 );
      if( readSize2 == 0 ) die("Read size cannot be zero: read2[ %s ]", currRead2);
      if( readSize2 < minReadSize ) minReadSize = readSize2;
      fprintf( freads2, ">%d\n%s\n", l, currRead2 );
      hlr_free( currRead2 );
    }
    fclose( freads2 );      
    
    // collapse the reads 2  ## requires the FASTX package
    stringPrintf( cmd, "%s -i %s/%s_reads2.fa -o %s/%s_reads2.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    hlr_system (string (cmd),0);
    
    //blat of reads2 against the first transcript
    stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript1.fa %s/%s_reads2.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id );
    
    // reading the results of blast from Pipe
    blatParser_initFromPipe( string(cmd) );
    while( blQ = blatParser_nextQuery() ) {
      int nucleotideOverlap = getNucleotideOverlap ( blQ );
      if ( nucleotideOverlap > ( ((double)readSize2)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) {
	char* value = strchr(blQ->qName,'-');
	homologousCount+=atoi(value+1);
      }
    }
    blatParser_deInit();
    
    // collapse the reads 1 ## requires the FASTX package on the path
    stringPrintf( cmd, "%s -i %s/%s_reads1.fa -o %s/%s_reads1.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    hlr_system (string (cmd),0);
    
    //blat of reads1 against the second transcript
    stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript2.fa %s/%s_reads1.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    
    blatParser_initFromPipe( string(cmd) );
    while( blQ = blatParser_nextQuery() ) {		
      int nucleotideOverlap = getNucleotideOverlap ( blQ );
      if ( nucleotideOverlap > ( ((double)readSize1)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) {
	char* value = strchr(blQ->qName,'-');
	homologousCount+=atoi(value+1);
      }
    }
    blatParser_deInit();
    stringPrintf (cmd,"cd %s;rm -rf %s_reads?.fa %s_reads?.collapsed.fa %s_transcript?.fa", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id);
    hlr_system( string(cmd) , 0);      
    if (((double)homologousCount / (double)arrayMax(currGE->readsTranscript1)) <= atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) { 
      homologousCount = 0;
      // there is no homology between the two genes, but what about the rest of the genome
      writeFasta( currGE, &minReadSize,  confp_get(conf, "TMP_DIR") );
      stringPrintf(cmd, "cd %s; %s %s %s / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.smallhomology.psl &>/dev/null", confp_get(conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), minReadSize - (int)(0.1 * minReadSize) > 20 ? minReadSize - (int) (0.1 * minReadSize) : 20 ,  currGE->id,  currGE->id);
      int attempts=0;
      ret = hlr_system( string(cmd), 1 );
      while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++;
      if( attempts == 5000 ) {
	die("Cannot map the reads %s", string( cmd ));
	return EXIT_FAILURE;
      }
      // reading the results of blast from File
      stringPrintf(cmd,  "%s/%s.smallhomology.psl", confp_get( conf, "TMP_DIR"), currGE->id);
      blatParser_initFromFile( string(cmd) );
      tooMany = 1;
      while( blQ = blatParser_nextQuery() ) {
	tooMany = 0;
	checkPseudogeneOverlap( blQ );
	if( arrayMax( blQ->entries ) > 1 ) {
	  homologousCount+= arrayMax( blQ->entries ) - 1;
	  char* value = strchr( blQ->qName,'/' );
	  if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName );
	  int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry
	  GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead );
	  currGIR->flag = 1;
	}
      }
      blatParser_deInit();
      if (  tooMany == 1 || ( ( (double) homologousCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) )  > atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) ) {
	countRemoved++;
	stringPrintf (cmd,"cd %s; rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id);
	hlr_system( string(cmd), 1 );
	continue;
      }
      // writing the gfrEntry, if everthing else didn't stop 
      if( homologousCount > 0 ) updateStats( currGE );
      puts (gfr_writeGfrEntry (currGE));
      count++;
      // removing temporary files
      stringPrintf (cmd,"cd %s;rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa  %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id);
      hlr_system( string(cmd) , 1);      
    } else {
      countRemoved++;
    }
    
  }

  gfr_deInit ();

  stringDestroy (fnSequencesToAlign);
  stringDestroy (cmd);
  stringDestroy (buffer);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);  
  warn ("%s_numGfrEntries: %d",argv[0],count);

  confp_close(conf);

  return EXIT_SUCCESS;
}
Ejemplo n.º 15
0
/**
 * CGI entry point of the geneFusions page.
 *
 * Without a query string the search form is printed. With a query string the
 * parameters "prefix", "type" and "minNum" are read and passed to
 * generateOutput(). A request that lacks one of the three parameters gets an
 * error page; the values are no longer passed on uninitialised.
 */
int main (int argc, char *argv[]) 
{
  char *queryString;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
    return EXIT_FAILURE;

  cgiInit();
  cgiHeader("text/html");
  queryString = cgiGet2Post();
  if (queryString[0] == '\0') {
    puts ("<html>");
    puts ("<head>");
    html_printGenericStyleSheet (12);
    puts ("<title>geneFusions</title>\n");
    puts ("</head>");
    puts ("<body>");
    puts ("<h1>Identification of potential gene fusions using paired-end reads</h1><br><br>");
    printf ("<form action=%s/geneFusions_cgi method=get>", confp_get(Conf, "WEB_URL_CGI"));
    puts ("<b>Data prefix</b>:&nbsp;");
    puts ("<input type=text name=prefix>");
    puts ("<br><br><br>");
    puts ("<b>Minimum number of paired-end reads connecting two genes</b>:&nbsp;");
    puts ("<select name=minNum>");
    puts ("<option value=2>2");
    puts ("<option value=3>3");
    puts ("<option value=5 selected>5");
    puts ("<option value=10>10");
    puts ("</select>");
    puts ("<br><br><br>");
    puts ("<b>Type of gene fusion</b>:&nbsp;");
    puts ("<select name=type>");
    puts ("<option value=read-through>Read-through events");
    puts ("<option value=cis>Cis events");
    puts ("<option value=intra>Intra-chromosomal events");
    puts ("<option value=same>Genes on the same chromosome");
    puts ("<option value=inter>Genes on different chromosomes");
    puts ("<option value=all selected>All potential gene fusions");
    puts ("</select>");
    puts ("<br><br><br>");
    puts ("<input type=submit value=Submit>");
    puts ("<input type=reset value=Reset>");
    puts ("</form>");
    puts ("</body>");
    puts ("</html>");
    fflush (stdout);
  }
  else {
    int first;
    Stringa item = stringCreate (20);
    Stringa value = stringCreate (20);
    char *iPtr,*vPtr;
    char *prefix = NULL;
    char *type = NULL;
    int minNum = 0;
    int hasMinNum = 0;

    first = 1;
    cgiGetInit ();
    while (cgiGetNextPair (&first,item,value)) {
      iPtr = string (item);
      vPtr = string (value);
      if (strEqual (iPtr,"prefix")) {
        prefix = hlr_strdup (vPtr);
      }
      if (strEqual (iPtr,"type")) {
        type = hlr_strdup (vPtr);
      }
      if (strEqual (iPtr,"minNum")) {
        minNum = atoi (vPtr);
        hasMinNum = 1;
      }
    }
    if (prefix == NULL || type == NULL || !hasMinNum) {
      // a hand-written or truncated URL may omit parameters
      puts ("<html>");
      puts ("<head>");
      puts ("<title>geneFusions</title>");
      puts ("</head>");
      puts ("<body>");
      puts ("<h1>Incomplete request</h1>");
      puts ("The parameters prefix, type and minNum are required.");
      puts ("</body>");
      puts ("</html>");
      fflush (stdout);
    }
    else {
      generateOutput (prefix,type,minNum);
    }
  }
  confp_close(Conf);

  return EXIT_SUCCESS;
}
Ejemplo n.º 16
0
/**
 * Annotates the variants of a VCF stream with the transcripts they affect.
 *
 * Usage: <program> <annotation.interval> <annotation.fa>
 *
 * The interval file (argv[1]) is loaded into the interval search space and the
 * FASTA file (argv[2]) is read completely and sorted by sequence name. The VCF
 * module is initialized with "-" (presumably standard input - confirm against
 * vcf_init). For every valid VCF entry and every alternate allele, the
 * overlapping intervals are classified (multi-exon hit, splice/start/end
 * overlap, insertion/deletion with or without frameshift, substitution). For
 * fully contained events that do not shift the frame, the transcript sequence
 * is rebuilt with the alternate allele, both versions are translated and passed
 * to addSubstitution().
 *
 * Entries for which at least one alternate allele produced an alteration are
 * written to stdout with ";VA=<annotation>" appended to the INFO column; entries
 * without any alteration are not written at all.
 *
 * @param argc Must be 3; otherwise usage() is called.
 * @param argv argv[1] = annotation interval file, argv[2] = annotation FASTA file.
 * @return 0 after the VCF module has been de-initialized.
 */
int main (int argc, char *argv[])
{
  Array intervals;
  Interval *currInterval;
  SubInterval *currSubInterval;
  int h,i,j;
  Array seqs;
  Seq *currSeq,testSeq;
  int index;
  Stringa buffer;
  Array geneTranscriptEntries;
  Texta geneTranscriptIds;
  Array alterations;
  Alteration *currAlteration,*nextAlteration;
  char *proteinSequenceBeforeIndel;
  char *proteinSequenceAfterIndel;
  int numDisabledTranscripts;
  Stringa disabledTranscripts;
  int seqLength,refLength,altLength;
  char *sequenceBeforeIndel = NULL;
  int overlapMode;
  int numOverlaps;
  int sizeIndel,indelOffset;
  int overlap;
  Array coordinates;
  VcfEntry *currVcfEntry;
  VcfGenotype *currVcfGenotype;
  int position;
  Texta alternateAlleles;
  int flag1,flag2;
  
  if (argc != 3) {
    usage ("%s <annotation.interval> <annotation.fa>",argv[0]);
  }
  // Load the annotation: intervals for overlap queries, sequences for translation
  intervalFind_addIntervalsToSearchSpace (argv[1],0);
  geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ());
  seq_init ();
  fasta_initFromFile (argv[2]);
  seqs = fasta_readAllSequences (0);
  fasta_deInit ();
  // Sorted by name so that arrayFind() below can look sequences up with the same comparator
  arraySort (seqs,(ARRAYORDERF)util_sortSequencesByName); 
  buffer = stringCreate (100);
  disabledTranscripts = stringCreate (100);
  alterations = arrayCreate (100,Alteration);
  vcf_init ("-");
  // Register the VA INFO tag before echoing the VCF header
  stringPrintf (buffer,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s\">",argv[1]);
  vcf_addComment (string (buffer));
  puts (vcf_writeMetaData ());
  puts (vcf_writeColumnHeaders ());
  while (currVcfEntry = vcf_nextEntry ()) {
    if (vcf_isInvalidEntry (currVcfEntry)) {
      continue;
    }
    // flag1: fixed columns of this entry already printed
    // flag2: at least one allele annotation already printed (controls the ',' separator)
    flag1 = 0;
    flag2 = 0;
    position = currVcfEntry->position - 1; // make zero-based
    alternateAlleles = vcf_getAlternateAlleles (currVcfEntry);
    for (h = 0; h < arrayMax (alternateAlleles); h++) {
      refLength = strlen (currVcfEntry->referenceAllele);
      altLength = strlen (textItem (alternateAlleles,h));
      sizeIndel = abs (refLength - altLength);
      // Number of bases beyond 'position' spanned by the longer of the two alleles
      indelOffset = MAX (refLength,altLength) - 1; 
      util_clearAlterations (alterations);
      intervals = intervalFind_getOverlappingIntervals (currVcfEntry->chromosome,position,position + indelOffset);
      for (i = 0; i < arrayMax (intervals); i++) {
        currInterval = arru (intervals,i,Interval*);
        // Classify how the variant range hits the sub-intervals of this interval.
        // numOverlaps counts the sub-intervals hit; overlapMode keeps the mode of the last hit.
        overlapMode = OVERLAP_NONE;
        numOverlaps = 0;
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          overlap = rangeIntersection (position,position + indelOffset,currSubInterval->start,currSubInterval->end);
          if (currSubInterval->start <= position && (position + indelOffset) < currSubInterval->end) {
            overlapMode = OVERLAP_FULLY_CONTAINED;
            numOverlaps++;
          }
          // First sub-interval, variant begins before its start
          else if (j == 0 && overlap > 0 && position < currSubInterval->start) {
            overlapMode = OVERLAP_START;
            numOverlaps++;
          }
          // Last sub-interval, variant reaches or passes its end
          else if (j == (arrayMax (currInterval->subIntervals) - 1) && overlap > 0 && (position + indelOffset) >= currSubInterval->end) {
            overlapMode = OVERLAP_END;
            numOverlaps++;
          }
          // Any other partial overlap with a sub-interval boundary
          else if (overlap > 0 && overlap <= indelOffset) {
            overlapMode = OVERLAP_SPLICE;
            numOverlaps++;
          }
        }
        if (overlapMode == OVERLAP_NONE) {
          continue;
        }
        // Request the element at index arrayMax, i.e. one past the current last element
        // (presumably this grows the array by one - confirm against the Array API)
        currAlteration = arrayp (alterations,arrayMax (alterations),Alteration);
        if (numOverlaps > 1) {
          util_addAlteration (currAlteration,currInterval->name,"multiExonHit",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_SPLICE) {
          util_addAlteration (currAlteration,currInterval->name,"spliceOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_START) {
          util_addAlteration (currAlteration,currInterval->name,"startOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_END) {
          util_addAlteration (currAlteration,currInterval->name,"endOverlap",currInterval,position,0);
          continue;
        }
        // Fully contained events: the type depends on the allele lengths;
        // FS/NFS = indel size is not / is a multiple of three
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength > refLength) {
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"insertionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"insertionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength < refLength) {
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"deletionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"deletionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength == refLength) {
          util_addAlteration (currAlteration,currInterval->name,"substitution",currInterval,position,0);
        }
        else {
          die ("Unexpected type: %d %s %s %s",
               currVcfEntry->position,currVcfEntry->chromosome,
               currVcfEntry->referenceAllele,currVcfEntry->alternateAllele);
        }
        // Frameshift indels are reported by type only; no protein comparison is made
        if ((sizeIndel % 3) != 0 && altLength != refLength) { 
          continue;
        }
        // Only run the remaining block of code if the indel is fully contained (insertion or deletion) AND does not cause a frameshift OR
        // if it is a substitution that is fully contained in the coding sequence
        // Build the lookup key name|chromosome|strand|start|end|... to find the transcript sequence
        stringPrintf (buffer,"%s|%s|%c|",currInterval->name,currInterval->chromosome,currInterval->strand);
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          stringAppendf (buffer,"%d|%d%s",currSubInterval->start,currSubInterval->end,j < arrayMax (currInterval->subIntervals) - 1 ? "|" : "");
        }
        testSeq.name = hlr_strdup (string (buffer));
        if (!arrayFind (seqs,&testSeq,&index,(ARRAYORDERF)util_sortSequencesByName)) {
          die ("Expected to find %s in seqs",string (buffer));
        }
        hlr_free (testSeq.name);
        currSeq = arrp (seqs,index,Seq);
        strReplace (&sequenceBeforeIndel,currSeq->sequence);
        seqLength = strlen (sequenceBeforeIndel); 
        coordinates = util_getCoordinates (currInterval);
        // arraySort (coordinates,(ARRAYORDERF)util_sortCoordinatesByChromosomeAndTranscriptPosition); Array is already sorted by definition
        // Rebuild the transcript sequence in 'buffer', applying the alternate allele at the
        // transcript offset whose genomic coordinate equals 'position'
        j = 0;
        stringClear (buffer);
        while (j < seqLength) {
          if (util_getGenomicCoordinate (coordinates,j,currVcfEntry->chromosome) == position) {
            if (altLength > refLength) {
              // Insertion: emit the whole alternate allele in place of one original base
              stringCat (buffer,textItem (alternateAlleles,h));
              j++;
              continue;
            }
            else if (altLength < refLength) {
              // Deletion: keep one original base, then skip ahead by (refLength - altLength + 1)
              stringCatChar (buffer,sequenceBeforeIndel[j]);
              j = j + refLength - altLength + 1;
              continue;
            }
            else {
              // Substitution: replace altLength original bases with the alternate allele
              stringCat (buffer,textItem (alternateAlleles,h));
              j = j + altLength;
              continue;
            }
          }
          stringCatChar (buffer,sequenceBeforeIndel[j]);
          j++;
        }
        util_destroyCoordinates (coordinates);
        // Translate the original and the modified sequence; the results are copied
        // before the second util_translate call and handed to addSubstitution()
        proteinSequenceBeforeIndel = hlr_strdup (util_translate (currInterval,sequenceBeforeIndel));
        proteinSequenceAfterIndel = hlr_strdup (util_translate (currInterval,string (buffer)));
        addSubstitution (currAlteration,proteinSequenceBeforeIndel,proteinSequenceAfterIndel,indelOffset);
        hlr_free (proteinSequenceBeforeIndel);
        hlr_free (proteinSequenceAfterIndel);
      }
      // Nothing hit by this alternate allele: move on to the next allele
      if (arrayMax (alterations) == 0) {
        continue;
      }
      // Sorting makes alterations with the same gene id and type adjacent so they can be grouped below
      arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType);
      stringClear (buffer);
      i = 0;
      while (i < arrayMax (alterations)) {
        // Start of a group: <allele index>:<geneName>:<geneId>:<strand>:<type>
        currAlteration = arrp (alterations,i,Alteration);
        stringAppendf (buffer,"%s%d:%s:%s:%c:%s",stringLen (buffer) == 0 ? "" : ",",h + 1,currAlteration->geneName,currAlteration->geneId,currAlteration->strand,currAlteration->type);
         stringClear (disabledTranscripts);
        // Transcript detail: the fields written depend on whether a substitution string
        // is set and on the alteration type
        if (currAlteration->substitution[0] != '\0') {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d_%s",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition,currAlteration->substitution);
        }
        else if (strEqual (currAlteration->type,"multiExonHit") || strEqual (currAlteration->type,"spliceOverlap") ||
                 strEqual (currAlteration->type,"startOverlap") || strEqual (currAlteration->type,"endOverlap")) {
          stringAppendf (disabledTranscripts,"%s:%s:%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength);
        }
        else {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition);
        }
        numDisabledTranscripts = 1;
        // Collect the following alterations that share gene id and type with the group head
        j = i + 1;
        while (j < arrayMax (alterations)) {
          nextAlteration = arrp (alterations,j,Alteration);
          if (strEqual (currAlteration->geneId,nextAlteration->geneId) && 
              strEqual (currAlteration->type,nextAlteration->type)) {
            if (nextAlteration->substitution[0] != '\0') {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d_%s",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition,nextAlteration->substitution);
            }
            else if (strEqual (nextAlteration->type,"multiExonHit") || strEqual (nextAlteration->type,"spliceOverlap") ||
                     strEqual (nextAlteration->type,"startOverlap") || strEqual (nextAlteration->type,"endOverlap")) {
              stringAppendf (disabledTranscripts,":%s:%s:%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength);
            }
            else {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition);
            }
            numDisabledTranscripts++;
          }
          else {
            break;
          }
          j++;
        }
        // Continue with the first alteration that did not belong to this group
        i = j;
        // Close the group: <affected transcripts>/<transcript ids found for the gene>:<details>
        geneTranscriptIds = util_getTranscriptIdsForGeneId (geneTranscriptEntries,currAlteration->geneId);
        stringAppendf (buffer,":%d/%d:%s",numDisabledTranscripts,arrayMax (geneTranscriptIds),string (disabledTranscripts));
      }
      // Print the fixed VCF columns once per entry, with the VA tag opened at the end of INFO
      if (flag1 == 0) {
        printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=",
                currVcfEntry->chromosome,currVcfEntry->position,currVcfEntry->id,
                currVcfEntry->referenceAllele,currVcfEntry->alternateAllele,
                currVcfEntry->quality,currVcfEntry->filter,currVcfEntry->info);
        flag1 = 1;
      }
      // Annotations of successive alternate alleles are comma-separated
      printf ("%s%s",flag2 == 1 ? "," : "",string (buffer)); 
      flag2 = 1;
    }
    // Finish the line with the genotype columns, only if the entry was printed at all
    if (flag1 == 1) {
      for (i = 0; i < arrayMax (currVcfEntry->genotypes); i++) {
        currVcfGenotype = arrp (currVcfEntry->genotypes,i,VcfGenotype);
        if (i == 0) {
          printf ("\t%s\t",currVcfEntry->genotypeFormat);
        }
        printf ("%s%s%s%s",currVcfGenotype->genotype,
                currVcfGenotype->details[0] != '\0' ? ":" : "",
                currVcfGenotype->details[0] != '\0' ?  currVcfGenotype->details : "",
                i < arrayMax (currVcfEntry->genotypes) - 1 ? "\t" : ""); 
      }
      puts ("");
    }
  }
  vcf_deInit ();
  return 0;
}
Ejemplo n.º 17
0
/**
 * Removes GFR entries whose gene pair appears in a black list.
 *
 * Usage: <program> <blackList.txt>
 *
 * The black list is a tab-delimited file with two gene names per line. GFR
 * entries are read via gfr_init ("-") and every entry whose
 * (geneSymbolTranscript1, geneSymbolTranscript2) pair matches a black-list pair
 * in either order is dropped; all other entries are written to stdout. Summary
 * counts are reported through warn().
 *
 * @param argc Must be 2; otherwise usage() is called.
 * @param argv argv[1] = black-list file name.
 * @return 0 on completion; die() is called if the black list cannot be opened
 *         or contains a line with only one column.
 */
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  LineStream ls;
  WordIter w;
  Array blackList;
  char *line;
  char *gene1,*gene2;
  int index;
  int found;
  int count;
  int countRemoved;

  if (argc != 2) {
    usage ("%s <blackList.txt>",argv[0]);
  }
  // Probe the file first so that a missing black list yields a clear message.
  // The handle is only needed for this check, so it is released right away.
  fp = fopen (argv[1],"r");
  if (!fp) {
    die ("Unable to open file: %s",argv[1]);
  }
  fclose (fp);

  // Read the black list: one gene pair per line
  blackList = arrayCreate (20,BLEntry);
  ls = ls_createFromFile (argv[1]);
  while (line = ls_nextLine (ls)) {
    if (line[0] == '\0') {
      continue;
    }
    w = wordIterCreate (line,"\t",1);
    gene1 = wordNext (w);
    gene2 = wordNext (w);
    if (gene1 == NULL) {
      // Line holds separators only: nothing to record
      wordIterDestroy (w);
      continue;
    }
    if (gene2 == NULL) {
      // Validate before duplicating: a pair needs both names
      die ("Expected two tab-separated gene names per line in %s, found only: %s",argv[1],gene1);
    }
    currBLE = arrayp (blackList,arrayMax (blackList),BLEntry);
    currBLE->gene1 = hlr_strdup (gene1);
    currBLE->gene2 = hlr_strdup (gene2);
    wordIterDestroy (w);
  }
  // arrayFind() below relies on the list being ordered with the same comparator
  arraySort (blackList,(ARRAYORDERF)sortBlackListByName1);

  // Filter the GFR entries
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) {
    // Look the pair up as gene1/gene2 ...
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    found = arrayFind (blackList,&currQuery,&index,(ARRAYORDERF)sortBlackListByName1);
    if (!found) {
      // ... and, if that fails, in the reverse order gene2/gene1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      found = arrayFind (blackList,&currQuery,&index,(ARRAYORDERF)sortBlackListByName1);
    }
    if (found) {
      countRemoved++;
      continue;
    }
    puts (gfr_writeGfrEntry (currGE));
    count++;
  }
  gfr_deInit ();
  arrayDestroy (blackList);
  warn ("%s_BlackListFilter: %s",argv[0], argv[1]);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  return 0;
}