/*
 * Prints one arrayMax test case to stdout.
 *
 * The first line shows the call being made, e.g. "arrayMax({1, 2, 3}, 3) is "
 * (or "arrayMax(NULL, 0) is " when no array is supplied).  The second line
 * shows the int that the pointer returned by arrayMax refers to, or "NULL"
 * when arrayMax returns a null pointer.
 *
 * array: elements to print and pass on to arrayMax; may be NULL.
 * n:     number of elements printed from array and passed to arrayMax.
 */
void doTest(int * array, int n) {
  printf("arrayMax(");
  if (array != NULL) {
    /* Brace-wrapped, comma-separated list of the n elements. */
    printf("{");
    int idx = 0;
    while (idx < n) {
      printf("%d", array[idx]);
      /* A separator follows every element except the last one. */
      if (idx != n - 1) {
        printf(", ");
      }
      idx++;
    }
    printf("}");
  }
  else {
    printf("NULL");
  }
  printf(", %d) is \n", n);

  int * largest = arrayMax (array, n);
  if (largest != NULL) {
    printf("%d\n", *largest);
  }
  else {
    printf("NULL\n");
  }
}
Beispiel #2
0
/*
 * Reads a tab-delimited biokit expression table and returns it to R as a
 * list carrying names, row.names and class "data.frame".
 *
 * filename: R character vector; only its first element is used as the path.
 *
 * File layout as consumed here: the first line is skipped as a header.  Every
 * following line must have at least MAND_NCOL (7) tab-separated fields:
 *   field 0      row name
 *   fields 1..6  RPKM_MultiMap (double), ReadCount_MultiMap (int),
 *                RPKM_UniqMap (double), ReadCount_UniqMap (int),
 *                MultiProp (double), AllMappingReads (int)
 *   fields 7..   optional annotation columns, returned as character columns
 *                "Annotation1", "Annotation2", ...; rows that have fewer
 *                annotation fields than the widest row get NA there.
 *
 * Raises an R error when a line has fewer than MAND_NCOL fields.
 *
 * NOTE(review): error() does not return, so the buffers allocated here are
 * not released on that path; this matches the previous behaviour.
 */
SEXP c_read_biokit_exprs (SEXP filename) {
  LineStream ls;
  char* line;
  const int MAND_NCOL=7; // the first column is the row name, and column 2-7 are mandatory
  int add_ncol=0;
  Texta it;
  Texta rnames=textCreate(128);
  Array mrpkms=arrayCreate(128, double);
  Array mreads=arrayCreate(128, int);
  Array srpkms=arrayCreate(128, double);
  Array sreads=arrayCreate(128, int);
  Array mprop=arrayCreate(128, double);
  Array allmap = arrayCreate(128, int);
  Array annos=arrayCreate(128, Texta);
  Texta anno=NULL; // scratch buffer; must start as NULL, otherwise textCreateClear leads to memory error
  Texta rowAnno;   // borrowed pointer into annos while building the R columns
  Stringa str=stringCreate(8);

  SEXP R_rnames, R_mrpkms, R_mreads, R_srpkms, R_sreads, R_mprop, R_allmap, R_res;
  SEXP R_colnames, R_class;
  
  int nprot=0;
  int i=0;
  int j=0;
  int nrow=0;
  const char* fn=CHAR(STRING_ELT(filename, 0));
  // NOTE(review): the strdup'ed copy is never freed here; whether
  // ls_createFromFile keeps the pointer is not visible from this function.
  ls = ls_createFromFile(strdup(fn));

  ls_nextLine(ls); // skip the first header line
  while((line = ls_nextLine(ls)) != NULL) {
    it = textFieldtokP(line, "\t");
    if(arrayMax(it)<MAND_NCOL)
      error("Input file must contain no less than %d columns", MAND_NCOL);

    textAdd(rnames, textItem(it, 0));
    array(mrpkms, arrayMax(mrpkms), double)=atof(textItem(it, 1));
    array(mreads, arrayMax(mreads), int)=atoi(textItem(it, 2));
    array(srpkms, arrayMax(srpkms), double)=atof(textItem(it, 3));
    array(sreads, arrayMax(sreads), int)=atoi(textItem(it, 4));
    array(mprop, arrayMax(mprop), double)=atof(textItem(it, 5));
    array(allmap, arrayMax(allmap), int)=atoi(textItem(it, 6));

    add_ncol = max(arrayMax(it)-MAND_NCOL, add_ncol);
    // Collect this row's optional annotation fields and keep a private copy.
    textCreateClear(anno, arrayMax(it)-MAND_NCOL);
    for(i=MAND_NCOL; i<arrayMax(it);  ++i) {
      textAdd(anno, textItem(it, i));
    }
    array(annos, arrayMax(annos), Texta)=textClone(anno);

    // Everything needed from the tokens has been copied (textAdd/textClone),
    // so the per-line token list can be released instead of leaking per row.
    textDestroy(it);
    nrow++;
  }

  R_rnames=PROTECT(allocVector(STRSXP, nrow)); nprot++;
  R_mrpkms=PROTECT(allocVector(REALSXP, nrow)); nprot++;
  R_mreads=PROTECT(allocVector(INTSXP, nrow)); nprot++;
  R_srpkms=PROTECT(allocVector(REALSXP, nrow)); nprot++;
  R_sreads=PROTECT(allocVector(INTSXP, nrow)); nprot++;
  R_mprop=PROTECT(allocVector(REALSXP, nrow)); nprot++;
  R_allmap=PROTECT(allocVector(INTSXP, nrow)); nprot++;

  for(i=0; i<nrow; ++i) {
    SET_STRING_ELT(R_rnames, i, mkChar(textItem(rnames, i)));
    REAL(R_mrpkms)[i]=arru(mrpkms, i, double);
    INTEGER(R_mreads)[i]=arru(mreads, i, int);
    REAL(R_srpkms)[i]=arru(srpkms, i, double);
    INTEGER(R_sreads)[i]=arru(sreads, i, int);
    REAL(R_mprop)[i]=arru(mprop, i, double);
    INTEGER(R_allmap)[i]=arru(allmap, i, int);
  }

  // One list element per column except the row-name column, hence the -1.
  R_res=PROTECT(allocVector(VECSXP, MAND_NCOL+add_ncol-1)); nprot++;
  SET_VECTOR_ELT(R_res, 0, R_mrpkms);
  SET_VECTOR_ELT(R_res, 1, R_mreads);
  SET_VECTOR_ELT(R_res, 2, R_srpkms);
  SET_VECTOR_ELT(R_res, 3, R_sreads);
  SET_VECTOR_ELT(R_res, 4, R_mprop);
  SET_VECTOR_ELT(R_res, 5, R_allmap);
  for(i=0; i<add_ncol; ++i) {
    SEXP R_anno=NULL;
    R_anno=PROTECT(allocVector(STRSXP, nrow));
    for(j=0; j<nrow; ++j) {
      // Use a separate variable so the scratch buffer 'anno' is not
      // overwritten (and thereby leaked) by a pointer owned by 'annos'.
      rowAnno=array(annos, j, Texta);
      if(arrayMax(rowAnno)>i) {
         SET_STRING_ELT(R_anno, j, mkChar(textItem(rowAnno, i)));
      } else {
         SET_STRING_ELT(R_anno, j, R_NaString);
      }
    }
    SET_VECTOR_ELT(R_res, i+MAND_NCOL-1, R_anno); // -1 because the first column is row name
    UNPROTECT(1); // R_anno is now reachable from the protected R_res
  }

  PROTECT(R_colnames=allocVector(STRSXP, MAND_NCOL+add_ncol-1)); nprot++;
  PROTECT(R_class=allocVector(STRSXP, 1)); nprot++;
  SET_STRING_ELT(R_colnames, 0, mkChar("RPKM_MultiMap"));
  SET_STRING_ELT(R_colnames, 1, mkChar("ReadCount_MultiMap"));
  SET_STRING_ELT(R_colnames, 2, mkChar("RPKM_UniqMap"));
  SET_STRING_ELT(R_colnames, 3, mkChar("ReadCount_UniqMap"));
  SET_STRING_ELT(R_colnames, 4, mkChar("MultiProp"));
  SET_STRING_ELT(R_colnames, 5, mkChar("AllMappingReads"));
  for(i=0; i<add_ncol; ++i) {
    stringPrintf(str, "Annotation%d", i+1);
    SET_STRING_ELT(R_colnames, i+MAND_NCOL-1,
                   mkChar(string(str)));
  }
  SET_STRING_ELT(R_class, 0, mkChar("data.frame"));
  setAttrib(R_res, install("names"), R_colnames);
  setAttrib(R_res, install("row.names"), R_rnames);
  setAttrib(R_res, install("class"), R_class);

  for(i=0; i<nrow; ++i) {
    textDestroy(array(annos, i, Texta));
  }
  arrayDestroy(annos);
  if(anno != NULL) {
    textDestroy(anno); // scratch buffer, stays NULL when the file has no data rows
  }
  // rnames is a Texta whose strings were copied in by textAdd, so it needs
  // textDestroy (as used for annos above); arrayDestroy leaked the strings.
  textDestroy(rnames);
  arrayDestroy(mrpkms);
  arrayDestroy(mreads);
  arrayDestroy(srpkms);
  arrayDestroy(sreads);
  arrayDestroy(mprop);
  arrayDestroy(allmap);
  stringDestroy(str);

  ls_destroy(ls);
  UNPROTECT(nprot);
  return(R_res);
}
/*Moves the motors simultaneously the number of steps given in the array.
The motors will start and stop at the same time, no matter the selected
speed and number of steps (the motor that would take the longest is taken
as reference, and the speed of other motors adjusted accordingly).*/
int StepperControl::moveAll(long *stepsE){
	movementCompleted = 0; //Resetting status.
	int exitStatus = 0; //Return value of the external function that can be added to the loop.
	uint16_t mask = 0; //Each bit contains info about what motor will be disabled (1 disabled, 0 enabled).
	
	long steps[motorsCount]; //Local array containing the number of steps to be executed by each motor.
	if(steps != NULL) memcpy(steps, stepsE, motorsCount*sizeof(long));
	else return 2; //End function if the given array is not valid.
	
	long minMovementDuration; //The lowest amount of time the slowest motor (or the one with the most steps to do) will need to complete it's movement.
	long durations[motorsCount]; //Used for calculation of time-dependent values. Has different contents.
	int slowestMotor = 0; //Index of the motor of which the movement will take the longest.
	int dir[motorsCount]; //Contains movement direction of the motors.
	
	totalSteps = 0; //Resetting the number of steps to be done.
	
	int slMin; //Index of the motor with the lowest signalLength value among the active ones.
	unsigned long *slp; //Points to a copy of signalLength.
	slp = (unsigned long*)calloc(motorsCount, sizeof(unsigned long));
	memcpy(slp, signalLength, motorsCount*sizeof(unsigned long)); //Sloppy (signalLength has type long*), but if signalLength contains negative values, there is a bigger problem.
	
	//The direction pins of all motors are set here depending on whether or not the number of steps is negative.
	//The total number of state changes to be done is also calculated here.
	for(int i = 0; i < motorsCount; i++){
		if(steps[i] < 0){
			dir[i] = 1;
			steps[i] = -steps[i];
		}else{
			dir[i] = 0;
		}
		totalSteps += steps[i];
		setDir(i, dir[i]); //Setting the correct values to the direction pins.
		
		durations[i] = movementTime[i] * steps[i];
		
		if(!steps[i]){
			mask |= (1 << i);
			slp[i] = 0; //Causes arrayMin in the next step to ignore the motors that will be inactive.
		}
	}
	totalSteps = totalSteps*2;
	
	//Set starting conditions (without this part the signal polarity can be wrong).
	slMin = arrayMin(slp, motorsCount, 0);
	for(int i = 0; i < motorsCount; i++){
		if(signalLength[i] > signalLength[slMin] && !(mask & (1 << i))){
			stepStatus[i] = 1;
			currentState[i] = 1;
		}else{
			stepStatus[i] = 0;
			currentState[i] = 0;
		}
	}
	free(slp);//We won't need slp any more.

	//Here, the form and duration of the impulses is determined.
	slowestMotor = arrayMax(durations, motorsCount);
	minMovementDuration = durations[slowestMotor];
	for(int i = 0; i < motorsCount; i++){
		durations[i] = minMovementDuration / steps[i];
		if(durations[i] > signalLength[i]) durations[i] -= signalLength[i];
		else durations[i] = 0;
		
		if(durations[i] > 0){
			lowTime[i] = (long)((clockFrequency/(64*(1000000.0/durations[i]))));
			highTime[i] = (long)((clockFrequency/(64*(1000000.0/signalLength[i]))));
		}else{
			lowTime[i] = 0;
			highTime[i] = 0;
		}
	}
	
	//Actual movement starts to happen here.
	//Setting all motors to HIGH at start.
	taskList[3][0] = 0xFFFF;
	fillTaskList(mask);//Preparing the buffer.
	enableInterrupt();//Starting movement.
	while(!movementCompleted){
		//if(idleCounter) Serial.println(idleCounter); //debug
		exitStatus = repeatInLoop(); //External function to be executed during the movement.
		if(exitStatus == 1) break; //If the external function returns 1, the movemen stops.
		fillTaskList(mask);
	}
	disableInterrupt();//Stopping interrupts, so that they won't interfere with the program.
	
	//Movement ended, resetting values.
	stepsCounter = 0;
	stepsCounterCalc = 0;
	movementCompleted = 0;
	executionIndex = 0;
	calculationIndex = 1;
	clearTaskList();
	for(int i = 0; i < motorsCount; i++){
		//individualStepsCounter[i] = 0; //Use resetIndividualStepsCounter() instead.
		currentState[i] = 0;
		stepStatus[i] = 0;
		nextStep[i] = 0;
	}
	
	return exitStatus;
}
Beispiel #4
0
/*
 * Annotates VCF entries read from standard input with the annotation
 * intervals they fall into and writes the annotated VCF to standard output.
 *
 * Usage: <program> <annotation.interval> <nameFeature>
 *
 * For every alternate allele of every valid entry, the intervals overlapping
 * the variant are looked up; an interval is kept only when a single one of
 * its sub-intervals contains the whole variant span.  The kept hits are
 * grouped by gene id and type and appended to the INFO column as "VA=...".
 * Entries for which no allele produced a hit are not written at all.
 */
int main (int argc, char *argv[])
{
  Array overlaps;
  Interval *interval;
  SubInterval *subInterval;
  int refLen,altLen,span;
  int alleleIdx,k,m;
  Stringa annotation;
  Array geneTranscriptEntries;
  Texta idsOfGene;
  Array alterations;
  Alteration *groupHead,*candidate;
  int groupSize;
  Stringa transcripts;
  VcfEntry *entry;
  int position;
  Texta alternateAlleles;
  int lineStarted,needComma;
  VcfGenotype *genotype;
  int numGenotypes;
 
  if (argc != 3) {
    usage ("%s <annotation.interval> <nameFeature>",argv[0]);
  }
  intervalFind_addIntervalsToSearchSpace (argv[1],0);
  geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ());
  annotation = stringCreate (100);
  transcripts = stringCreate (100);
  alterations = arrayCreate (100,Alteration);
  vcf_init ("-");
  stringPrintf (annotation,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s, %s\">",argv[1],argv[2]);
  vcf_addComment (string (annotation));
  puts (vcf_writeMetaData ());
  puts (vcf_writeColumnHeaders ());

  while ((entry = vcf_nextEntry ()) != NULL) {
    if (vcf_isInvalidEntry (entry)) {
      continue;
    }
    lineStarted = 0;
    needComma = 0;
    position = entry->position - 1; // make zero-based
    alternateAlleles = vcf_getAlternateAlleles (entry);

    for (alleleIdx = 0; alleleIdx < arrayMax (alternateAlleles); alleleIdx++) {
      refLen = strlen (entry->referenceAllele);
      altLen = strlen (textItem (alternateAlleles,alleleIdx));
      span = MAX (refLen,altLen) - 1; 
      util_clearAlterations (alterations);

      /* Collect every interval with one sub-interval covering the whole span. */
      overlaps = intervalFind_getOverlappingIntervals (entry->chromosome,position,position + span);
      for (k = 0; k < arrayMax (overlaps); k++) {
        int contained = 0;
        interval = arru (overlaps,k,Interval*);
        for (m = 0; m < arrayMax (interval->subIntervals); m++) {
          subInterval = arrp (interval->subIntervals,m,SubInterval);
          if (subInterval->start <= position && (position + span) < subInterval->end) {
            contained = 1;
            break;
          }
        }
        if (contained) {
          util_addAlteration (arrayp (alterations,arrayMax (alterations),Alteration),interval->name,argv[2],interval,position,0);
        }
      }
      if (arrayMax (alterations) == 0) {
        continue;
      }

      /* After sorting, hits of the same gene and type are adjacent; emit one
         record per such run, listing all transcripts of the run. */
      arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType);
      stringClear (annotation);
      k = 0;
      while (k < arrayMax (alterations)) {
        groupHead = arrp (alterations,k,Alteration);
        stringAppendf (annotation,"%s%d:%s:%s:%c:%s",stringLen (annotation) == 0 ? "" : "|",alleleIdx + 1,groupHead->geneName,groupHead->geneId,groupHead->strand,groupHead->type);
        stringClear (transcripts);
        stringAppendf (transcripts,"%s:%s:%d_%d",groupHead->transcriptName,groupHead->transcriptId,groupHead->transcriptLength,groupHead->relativePosition);
        groupSize = 1;
        for (m = k + 1; m < arrayMax (alterations); m++) {
          candidate = arrp (alterations,m,Alteration);
          if (!strEqual (groupHead->geneId,candidate->geneId) || 
              !strEqual (groupHead->type,candidate->type)) {
            break;
          }
          stringAppendf (transcripts,":%s:%s:%d_%d",candidate->transcriptName,candidate->transcriptId,candidate->transcriptLength,candidate->relativePosition);
          groupSize++;
        }
        k = m; /* first element of the next run */
        idsOfGene = util_getTranscriptIdsForGeneId (geneTranscriptEntries,groupHead->geneId);
        stringAppendf (annotation,":%d/%d:%s",groupSize,arrayMax (idsOfGene),string (transcripts));
      }

      /* The fixed VCF columns are written once, before the first annotation. */
      if (lineStarted == 0) {
        printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=",
                entry->chromosome,entry->position,entry->id,
                entry->referenceAllele,entry->alternateAllele,
                entry->quality,entry->filter,entry->info);
        lineStarted = 1;
      }
      printf ("%s%s",needComma == 1 ? "," : "",string (annotation)); 
      needComma = 1;
    }

    if (lineStarted != 1) {
      continue;
    }
    /* Finish the line with the genotype format and the genotype columns. */
    for (k = 0; k < arrayMax (entry->genotypes); k++) {
      genotype = arrp (entry->genotypes,k,VcfGenotype);
      if (k == 0) {
        printf ("\t%s\t",entry->genotypeFormat);
      }
      numGenotypes = arrayMax (entry->genotypes);
      if (genotype->details[0] != '\0') {
        printf ("%s%s%s%s",genotype->genotype,":",genotype->details,
                k < numGenotypes - 1 ? "\t" : ""); 
      }
      else {
        printf ("%s%s%s%s",genotype->genotype,"","",
                k < numGenotypes - 1 ? "\t" : ""); 
      }
    }
    puts ("");
  }
  vcf_deInit ();
  return 0;
}
/*
 * Pager callback: examines every process slot in q once per call.
 *
 * For an active process it records the page currently being executed
 * (pc / PAGESIZE), updates the page-transition statistics
 * (betRevlentPage, revelentPageFreq, lastPageCalled), asks the arrayMax /
 * arrayMin / lru helpers for candidate pages, and requests the current page
 * plus two candidate pages with pagein.  If the current page is not resident
 * and a pagein request is refused, the candidate victim pages are paged out.
 * For an inactive process all of its pages are paged out.
 *
 * At the end pager(q) is called and the global tick is advanced.
 *
 * NOTE(review): arrayMax/arrayMin/lru/effChecker/pager are defined elsewhere;
 * the value -100 is treated below as "no secondary candidate" — confirm
 * against those helpers.
 */
void pageit(Pentry q[MAXPROCESSES]) { 

    /* Static vars */
    static int initialized = 0;
    static int procount=0;

    /* Local vars */
    int proctmp;
    int pagetmp;
    int proc;
    int pc;
    int page;
    int tmppage;
    int i;
    int pageOne;
    int pageTwo;
    int secondMaxPage;
    int secondsecondMaxPage;
    int minPage;
    int minPageOne;
    int minPageTwo;
    int secondMinPageOne;
    int secondMinPageTwo;

    /* initialize static vars on first run */
    if(!initialized){
        numberofpagesout=0;
        for(proctmp=0; proctmp < MAXPROCESSES; proctmp++){
            for(pagetmp=0; pagetmp < MAXPROCPAGES; pagetmp++){
                timestamps[proctmp][pagetmp] = 0;
                pageLastRemovedAt[proctmp][pagetmp]=0; 
                for(i=0; i<2;i++){
                    betRevlentPage[proctmp][pagetmp][i]=0;
                }
                for(i=0;i<MAXPROCPAGES;i++){
                    revelentPageFreq[proctmp][pagetmp][i]=0;
                }
            }
            lastPageCalled[proctmp]=0;
        }
        tick=1;
        prevtick=0;
        initialized = 1;
    }

    for(proc=0; proc <MAXPROCESSES; proc++){
        if(q[proc].active){
            pc = q[proc].pc;
            page=pc/PAGESIZE;

            timestamps[proc][page] = tick;

            /* Page this process was on at the previous call.  lastPageCalled
               is kept per process (it is initialised and written with a
               process index), so it must be read with proc, not with page. */
            tmppage=lastPageCalled[proc];
            /* Remember the two most recent successors of tmppage. */
            betRevlentPage[proc][tmppage][1]=betRevlentPage[proc][tmppage][0];
            betRevlentPage[proc][tmppage][0]=page;

            /* Count the transition tmppage -> page. */
            revelentPageFreq[proc][tmppage][page]++;
            lastPageCalled[proc]=page;

            pageOne=arrayMax(proc,page,&secondMaxPage);
            pageTwo=arrayMax(proc,pageOne,&secondsecondMaxPage);

            minPageOne=arrayMin(q,proc,page,&secondMinPageOne);
            minPageTwo=arrayMin(q,proc,pageOne,&secondMinPageTwo);

            minPage=lru(q,proc,page);
            pagein(proc,page);
            pagein(proc,pageTwo);
            pagein(proc,pageOne);

            if(!q[proc].pages[page]){
                effChecker(proc,page);
                /* The first refused pagein decides which victims are evicted. */
                if(!pagein(proc,page)){
                    pageout(proc,minPage);
                    pageout(proc,minPageOne);
                    pageout(proc,minPageTwo);
                    if(secondMinPageTwo!=-100){
                        pageout(proc,secondMinPageTwo);
                    }
                    if(secondMinPageOne!=-100){
                        pageout(proc, secondMinPageOne);
                    }
                }else if(!pagein(proc,pageTwo)){
                    /* Unlike the other two cases, minPageTwo is kept here. */
                    pageout(proc,minPage);
                    pageout(proc,minPageOne);
                    if(secondMinPageTwo!=-100){
                        pageout(proc,secondMinPageTwo);
                    }
                    if(secondMinPageOne!=-100){
                        pageout(proc, secondMinPageOne);
                    }
                }else if(!pagein(proc,pageOne)){
                    pageout(proc,minPage);
                    pageout(proc,minPageOne);
                    pageout(proc,minPageTwo);
                    if(secondMinPageTwo!=-100){
                        pageout(proc,secondMinPageTwo);
                    }
                    if(secondMinPageOne!=-100){
                        pageout(proc, secondMinPageOne);
                    }
                }
            }
        }else{//remove all things related to the process if not active
            for(i=0;i<MAXPROCPAGES;i++){
                pageout(proc,i);
                procount++;
            }
        }
    }

    /* advance time for next pageit iteration */
    pager(q);
    tick++;
} 
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  int count;
  int countRemoved;
  int mitochondrialCount; 
  unsigned int minReadSize;
  int  i;
  Stringa cmd;
  BlatQuery *blQ=NULL;
  config *conf = NULL; /**< Pointer to configuration file .fusionseqrc  */

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH"));
    return EXIT_FAILURE;
  }
  if( confp_get( conf,"MAX_OVERLAP_ALLOWED")==NULL ) {
    die("%s:\tCannot find MAX_OVERLAP_ALLOWED in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf,"MAX_FRACTION_HOMOLOGOUS")==NULL ) {
    die("%s:\tCannot find MAX_FRACTION_HOMOLOGOUS in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "MITOCHONDRIAL_DIR")==NULL ) {
    die("%s:\tCannot find MITOCHONDRIAL_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf, "MITOCHONDRIAL_FILENAME")==NULL ) {
    die("%s:\tCannot find MITOCHONDRIAL_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "TMP_DIR")==NULL ) {
    die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFSERVER")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) {
    die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }

  count = 0;
  countRemoved = 0;
  
  cmd = stringCreate (100);
  // initializing the gfServers
  stringPrintf( cmd, "%s status %s %d &> /dev/null", confp_get( conf, "BLAT_GFSERVER"),  confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2);
  int ret = hlr_system( string(cmd), 1 );
   if( ret != 0 ) { // not initialized
    stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_mitochondrial.log start %s %d %s/%s  &", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "TMP_DIR"),  confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2, confp_get( conf, "MITOCHONDRIAL_DIR"), confp_get( conf,"MITOCHONDRIAL_FILENAME"));
    hlr_system( string( cmd ), 0 );
    long int startTime = time(0);
    stringPrintf( cmd , "%s status %s %d &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2);
    while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ;
    if( hlr_system( string(cmd), 1 ) != 0 )  {
      die("gfServer for %s/%s not initialized: %s %s %s", confp_get( conf, "MITOCHONDRIAL_DIR"), confp_get( conf, "MITOCHONDRIAL_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); 
      return EXIT_FAILURE;
    }
  } 

 
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) {
    if (strEqual(currGE->chromosomeTranscript1, "chrM") || 
	strEqual(currGE->chromosomeTranscript2, "chrM")) {
      countRemoved++;
      continue;
    } else {
      mitochondrialCount = 0;
      minReadSize=1000;
      writeFasta( currGE, &minReadSize, confp_get( conf, "TMP_DIR") ); // in util.c
      stringPrintf(cmd, "cd %s;%s %s %d / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.mito.psl &>/dev/null", confp_get( conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), atoi(confp_get( conf, "BLAT_GFSERVER_PORT")) + 2, minReadSize - 5 > 20 ? minReadSize - 5 : 20 , currGE->id, currGE->id);
      int attempts=0;
      ret = hlr_system( string(cmd), 1 );
      while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++;
      if( attempts == 5000 ) {
	die("Cannot map the reads %s", string( cmd ));
	return EXIT_FAILURE;
      }

      // reading the results of blast from File
      stringPrintf(cmd,  "%s/%s.mito.psl", confp_get( conf, "TMP_DIR"), currGE->id);
      blatParser_initFromFile( string(cmd) );
      while( blQ = blatParser_nextQuery() ) {
	//warn("iter %d\tquery %s", iter, blQ->qName );iter++; 
	int nucleotideOverlap = getNucleotideOverlap ( blQ );
	if (nucleotideOverlap > (((double) minReadSize) * strtod(confp_get( conf, "MAX_OVERLAP_ALLOWED"), NULL))) {
	  char* value = strchr( blQ->qName,'/' );
	  if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName );
	  int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry
	  GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead );
	  currGIR->flag = 1;
	  mitochondrialCount++;
	} 
      }
      blatParser_deInit();
      if ( ( (double) mitochondrialCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) ) <= strtod(confp_get( conf, "MAX_FRACTION_HOMOLOGOUS"), NULL)) {   
	if( mitochondrialCount > 0 ) updateStats( currGE );
	// writing the gfrEntry
	puts (gfr_writeGfrEntry (currGE));
	count++;
      } else {
	countRemoved++;
      }
      // removing temporary files
      stringPrintf (cmd,"rm -rf %s/%s_reads.fa %s/%s.mito.psl", confp_get( conf, "TMP_DIR"),  currGE->id, confp_get( conf, "TMP_DIR"),  currGE->id );
      hlr_system( string(cmd) , 1);      
    } 
    
  }
  gfr_deInit ();
 
  stringDestroy( cmd );
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  confp_close(conf);
  return 0;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  int i,j,k,l, h,index;
  Stringa buffer,cmd,fnSequencesToAlign;
  FILE *fp;
  FILE *fp1;
  FILE *fp2;
  FILE *freads1;
  FILE *freads2;
  Array gfrEntries;
  BowtieQuery *currBQ,testBQ;
  BowtieEntry *currBE;
  Texta seqNames;
  int readSize1, readSize2, minReadSize;
  Array bowtieQueries;
  char transcriptNumber;
  int isHomologous,homologousCount;
  int count;
  int countRemoved;
  unsigned short int tooMany;
  BlatQuery *blQ;

  config *conf;

  if ((conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc", argv[0]);
    return EXIT_FAILURE;
  } 
  if ( (confp_get( conf, "BLAT_TWO_BIT_TO_FA")) == NULL) {
    die("%s:\tCannot find BLAT_TWO_BIT_TO_FA in the configuration file: %s", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  } 
  if ( (confp_get( conf,"BLAT_DATA_DIR")) == NULL) {
    die("%s:\tCannot find BLAT_DATA_DIR in the configuration file: %sc", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  } 
 if( confp_get( conf, "TMP_DIR")==NULL ) {
    die("%s:\tCannot find TMP_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( conf, "BLAT_GFSERVER")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "BLAT_GFCLIENT")==NULL ) {
    die("%s:\tCannot find BLAT_GFCLIENT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
if( confp_get( conf, "BLAT_GFSERVER_HOST")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_HOST in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }if( confp_get( conf, "BLAT_GFSERVER_PORT")==NULL ) {
    die("%s:\tCannot find BLAT_GFSERVER_PORT in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
 if( confp_get( conf, "PSEUDOGENE_DIR")==NULL ) {
   die("%s:\tCannot find PSEUDOGENE_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
   return EXIT_FAILURE;
 }
 if( confp_get( conf, "PSEUDOGENE_FILENAME")==NULL ) {
   die("%s:\tCannot find PSEUDOGENE_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
   return EXIT_FAILURE;
 }
 
  cmd = stringCreate (100);
  // initializing the gfServers
  stringPrintf( cmd, "%s status %s %s &> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT") );
  int ret = hlr_system( string(cmd), 1 );
  if( ret != 0 ) { // not initialized
    stringPrintf( cmd , "%s -repMatch=100000 -tileSize=12 -canStop -log=%s/gfServer_genome.log start %s %s %s/%s  &", confp_get( conf, "BLAT_GFSERVER"), confp_get(conf, "TMP_DIR"),confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"));
    hlr_system( string( cmd ), 0 );
    long int startTime = time(0);
    stringPrintf( cmd , "%s status %s %s &2> /dev/null", confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"));
    while( hlr_system( string(cmd), 1) && (time(0)-startTime)<600 ) ;
    if( hlr_system( string(cmd), 1 ) != 0 )  {
      die("gfServer for %s/%s not initialized: %s %s %s", confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), confp_get( conf, "BLAT_GFSERVER"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT")); 
      return EXIT_FAILURE;
    }
  } 
  // end initialization

  
  gfr_init ("-");
  gfrEntries =  gfr_parse ();
  if (arrayMax (gfrEntries) == 0){
    puts (gfr_writeHeader ());
    gfr_deInit ();
    return 0;
  }
  seqNames = textCreate (10000); 
  buffer = stringCreate (100);
  fnSequencesToAlign = stringCreate (100);
  count = 0;
  countRemoved = 0;

  stringPrintf( buffer, "%s/%s", confp_get( conf, "PSEUDOGENE_DIR"), confp_get( conf, "PSEUDOGENE_FILENAME") );
  intervalFind_addIntervalsToSearchSpace (string(buffer),0);

  puts (gfr_writeHeader ());
 
  for (i = 0; i < arrayMax (gfrEntries); i++) {
    currGE = arrp (gfrEntries,i,GfrEntry);
    homologousCount = 0;
    minReadSize=10000;
    // creating two fasta files with the two genes
    
    stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript1.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA") , confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript1, currGE->startTranscript1, currGE->endTranscript1, confp_get(conf, "TMP_DIR"), currGE->id);
    hlr_system( string(cmd) , 0);   
    stringPrintf( cmd, "%s %s/%s -seq=%s -start=%d -end=%d %s/%s_transcript2.fa", confp_get(conf, "BLAT_TWO_BIT_TO_FA"),  confp_get(conf, "BLAT_DATA_DIR"), confp_get(conf, "BLAT_TWO_BIT_DATA_FILENAME"), currGE->chromosomeTranscript2, currGE->startTranscript2, currGE->endTranscript2, confp_get(conf, "TMP_DIR"), currGE->id);
    hlr_system( string(cmd) , 0);   
    
    Stringa fa1 = stringCreate( 100 ); 
    Stringa fa2 = stringCreate( 100 );
    
    // creating the two fasta files with the reads
    stringPrintf( fa1, "%s/%s_reads1.fa", confp_get(conf, "TMP_DIR"), currGE->id);
    if (!(freads1 = fopen ( string(fa1) ,"w"))) {
      die ("Unable to open file: %s",string (fa1));
    }   
    // writing the reads of the first end into file
    
    for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) {
      char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l)); // read1
      readSize1 = strlen( currRead1 );
      if( readSize1 == 0 ) die("Read size cannot be zero: read1[ %s ]", currRead1);
      if( readSize1 < minReadSize ) minReadSize = readSize1;
      fprintf( freads1, ">%d\n%s\n", l, currRead1 );
      hlr_free( currRead1 );
    }
    fclose( freads1 );
    
    stringPrintf( fa2, "%s/%s_reads2.fa", confp_get(conf, "TMP_DIR"), currGE->id);
    if (!(freads2 = fopen ( string(fa2) ,"w"))) {
      die ("Unable to open file: %s",string (fa2));
    } 
    // writing the reads of the second end into file
    for (l = 0; l < arrayMax (currGE->readsTranscript2); l++) {
      char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l)); // read2
      readSize2 = strlen( currRead2 );
      if( readSize2 == 0 ) die("Read size cannot be zero: read2[ %s ]", currRead2);
      if( readSize2 < minReadSize ) minReadSize = readSize2;
      fprintf( freads2, ">%d\n%s\n", l, currRead2 );
      hlr_free( currRead2 );
    }
    fclose( freads2 );      
    
    // collapse the reads 2  ## requires the FASTX package
    stringPrintf( cmd, "%s -i %s/%s_reads2.fa -o %s/%s_reads2.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    hlr_system (string (cmd),0);
    
    //blat of reads2 against the first transcript
    stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript1.fa %s/%s_reads2.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id );
    
    // reading the results of blast from Pipe
    blatParser_initFromPipe( string(cmd) );
    while( blQ = blatParser_nextQuery() ) {
      int nucleotideOverlap = getNucleotideOverlap ( blQ );
      if ( nucleotideOverlap > ( ((double)readSize2)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) {
	char* value = strchr(blQ->qName,'-');
	homologousCount+=atoi(value+1);
      }
    }
    blatParser_deInit();
    
    // collapse the reads 1 ## requires the FASTX package on the path
    stringPrintf( cmd, "%s -i %s/%s_reads1.fa -o %s/%s_reads1.collapsed.fa", confp_get(conf, "FASTX_COLLAPSER"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    hlr_system (string (cmd),0);
    
    //blat of reads1 against the second transcript
    stringPrintf( cmd, "%s -t=dna -out=psl -fine -tileSize=15 %s/%s_transcript2.fa %s/%s_reads1.collapsed.fa stdout",confp_get(conf, "BLAT_BLAT"), confp_get(conf, "TMP_DIR"), currGE->id, confp_get(conf, "TMP_DIR"), currGE->id  );
    
    blatParser_initFromPipe( string(cmd) );
    while( blQ = blatParser_nextQuery() ) {		
      int nucleotideOverlap = getNucleotideOverlap ( blQ );
      if ( nucleotideOverlap > ( ((double)readSize1)* atof(confp_get(conf,"MAX_OVERLAP_ALLOWED"))) ) {
	char* value = strchr(blQ->qName,'-');
	homologousCount+=atoi(value+1);
      }
    }
    blatParser_deInit();
    stringPrintf (cmd,"cd %s;rm -rf %s_reads?.fa %s_reads?.collapsed.fa %s_transcript?.fa", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id);
    hlr_system( string(cmd) , 0);      
    if (((double)homologousCount / (double)arrayMax(currGE->readsTranscript1)) <= atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) { 
      homologousCount = 0;
      // there is no homology between the two genes, but what about the rest of the genome
      writeFasta( currGE, &minReadSize,  confp_get(conf, "TMP_DIR") );
      stringPrintf(cmd, "cd %s; %s %s %s / -t=dna -q=dna -minScore=%d -out=psl %s_reads.fa %s.smallhomology.psl &>/dev/null", confp_get(conf, "TMP_DIR"), confp_get( conf, "BLAT_GFCLIENT"), confp_get( conf, "BLAT_GFSERVER_HOST"), confp_get( conf, "BLAT_GFSERVER_PORT"), minReadSize - (int)(0.1 * minReadSize) > 20 ? minReadSize - (int) (0.1 * minReadSize) : 20 ,  currGE->id,  currGE->id);
      int attempts=0;
      ret = hlr_system( string(cmd), 1 );
      while( hlr_system( string(cmd), 1 ) && attempts<5000 ) attempts++;
      if( attempts == 5000 ) {
	die("Cannot map the reads %s", string( cmd ));
	return EXIT_FAILURE;
      }
      // reading the results of blast from File
      stringPrintf(cmd,  "%s/%s.smallhomology.psl", confp_get( conf, "TMP_DIR"), currGE->id);
      blatParser_initFromFile( string(cmd) );
      tooMany = 1;
      while( blQ = blatParser_nextQuery() ) {
	tooMany = 0;
	checkPseudogeneOverlap( blQ );
	if( arrayMax( blQ->entries ) > 1 ) {
	  homologousCount+= arrayMax( blQ->entries ) - 1;
	  char* value = strchr( blQ->qName,'/' );
	  if( value ) *value = '\0'; else die("Not a valid index in the blat query name:\t%s", blQ->qName );
	  int indexOfInter = atoi( blQ->qName ); // the following three lines should removed the read if writing the GFR entry
	  GfrInterRead *currGIR = arrp( currGE->interReads, indexOfInter, GfrInterRead );
	  currGIR->flag = 1;
	}
      }
      blatParser_deInit();
      if (  tooMany == 1 || ( ( (double) homologousCount / (double) ( arrayMax(currGE->readsTranscript1) + arrayMax(currGE->readsTranscript2) ) )  > atof(confp_get(conf, "MAX_FRACTION_HOMOLOGOUS")) ) ) {
	countRemoved++;
	stringPrintf (cmd,"cd %s; rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id);
	hlr_system( string(cmd), 1 );
	continue;
      }
      // writing the gfrEntry, if everthing else didn't stop 
      if( homologousCount > 0 ) updateStats( currGE );
      puts (gfr_writeGfrEntry (currGE));
      count++;
      // removing temporary files
      stringPrintf (cmd,"cd %s;rm -rf %s_reads*.fa %s_reads?.collapsed.fa %s_transcript?.fa  %s.smallhomology.psl", confp_get(conf, "TMP_DIR"), currGE->id,currGE->id,currGE->id,currGE->id);
      hlr_system( string(cmd) , 1);      
    } else {
      countRemoved++;
    }
    
  }

  gfr_deInit ();

  stringDestroy (fnSequencesToAlign);
  stringDestroy (cmd);
  stringDestroy (buffer);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);  
  warn ("%s_numGfrEntries: %d",argv[0],count);

  confp_close(conf);

  return EXIT_SUCCESS;
}
Beispiel #8
0
int main (int argc, char **argv)
{
  LineStream ls;
  Texta tokens = NULL;
  char *line;

  int hasQual = 0;
  int hasSeqs = 0;
  int start=1;
 
  ls = ls_createFromFile ("-");
  while (line = ls_nextLine (ls)) {
    // Put all the lines of the SAM header in comments
    if (line[0] == '@') {
      printf ("# %s\n", line);
      continue;
    }
    // Parse each SAM entry and store into array   
    tokens = textFieldtokP (line, "\t");
    if (arrayMax (tokens) < 11) {
      textDestroy( tokens );
      ls_destroy (ls);
      die ("Invalid SAM entry: %s", line);
    }
    SamEntry *currSamE = NULL;
    SamEntry *mateSamE = NULL;
    AllocVar(currSamE ); 

    int ret = generateSamEntry( tokens, currSamE, &hasSeqs, &hasQual );
    textDestroy( tokens );
    if ( ret==0 ) {
      if ( isPaired ( currSamE ) )
	ls_nextLine( ls ); // discarding next entry too (the mate)
      destroySamEntry( currSamE );
      freeMem( currSamE );
      continue;
    }   
    if ( isPaired( currSamE ) )   {
      int hasQual2, hasSeq2;
      AllocVar( mateSamE );
      Texta secondEnd = NULL;
      secondEnd = textFieldtok (ls_nextLine( ls ) , "\t");
      ret = generateSamEntry( secondEnd, mateSamE, &hasSeq2, &hasQual2 );
      textDestroy( secondEnd );
      if( ret == 0 ) {
	destroySamEntry( currSamE );
	destroySamEntry( mateSamE );
	freeMem( currSamE );
	freeMem( mateSamE );
	continue;
      }
      if (strcmp (currSamE->qname, mateSamE->qname) != 0) {
        die ("Please note that for paired-end data, sam2mrf requires the mate pairs to be on subsequent lines. You may want to sort the SAM file first.\nEx: sort -r file.sam | sam2mrf > file.mrf\n");
      }
    } 

    // Print MRF headers
    if( start ) {
      printf ("%s", MRF_COLUMN_NAME_BLOCKS);
      if (hasSeqs) printf("\t%s", MRF_COLUMN_NAME_SEQUENCE);
      if (hasQual) printf("\t%s", MRF_COLUMN_NAME_QUALITY_SCORES);
      printf ("\t%s\n", MRF_COLUMN_NAME_QUERY_ID);
      start=0;
    }
    
    // Print AlignmentBlocks   
    printMrfAlignBlocks (currSamE, R_FIRST);
    if( isPaired ( currSamE ) ) {  
      printf ("|");
      printMrfAlignBlocks (mateSamE, R_SECOND);
    }

    seq_init();
    // Print Sequence
    if (hasSeqs) {
      if (!currSamE->seq)
        die ("Entry missing sequence column\n");
      if( currSamE->flags & S_QUERY_STRAND )
	seq_reverseComplement( currSamE->seq, strlen(currSamE->seq));
      printf ("\t%s", currSamE->seq);
      if (mateSamE) {
        if (!mateSamE->seq)
          die ("Entry missing sequence column\n");
        if( mateSamE->flags & S_MATE_STRAND )
	  seq_reverseComplement( mateSamE->seq, strlen(mateSamE->seq));
	printf ("|%s", mateSamE->seq);
      }
    }
    // Print quality scores
    if (hasQual) {
      if (!currSamE->qual)
        die ("Entry missing quality scores column\n");
      printf ("\t%s", currSamE->qual);
      if (mateSamE) {
        if (!mateSamE->qual)
          die ("Entry missing quality scores column\n");
        printf ("|%s", mateSamE->qual);
      }
    }

    // Print queryID

    if (mateSamE) {
      printf ("\t%s|%s", currSamE->qname,"2"); // No need to print out both IDs, but need the pipe symbol for consistency
    }
    else {
      printf ("\t%s", currSamE->qname);
    }
    printf("\n");
    
    destroySamEntry( currSamE );
    freeMem( currSamE ); 
    if( isPaired( currSamE ) ) {
      destroySamEntry ( mateSamE );
      freeMem( mateSamE );
    }
  }
  // clean up
  ls_destroy (ls);
  return EXIT_SUCCESS;
}
Beispiel #9
0
// Annotates the variants of a VCF stream with their effect on transcripts.
//
// Usage: <program> <annotation.interval> <annotation.fa>
//   argv[1]  interval annotation that is loaded into the interval search space
//   argv[2]  FASTA file with the sequences belonging to those intervals
// The VCF itself is read through vcf_init ("-"); meta data, column headers and
// every annotated entry are written to stdout. Entries for which no alternate
// allele produces an alteration are not printed at all (flag1 stays 0).
// For each alternate allele, every overlapping interval is classified as
// multiExonHit, spliceOverlap, startOverlap, endOverlap, insertionFS/NFS,
// deletionFS/NFS or substitution, and the result is appended to the INFO
// column as a "VA=" attribute.
int main (int argc, char *argv[])
{
  Array intervals;
  Interval *currInterval;
  SubInterval *currSubInterval;
  int h,i,j;
  Array seqs;
  Seq *currSeq,testSeq;
  int index;
  Stringa buffer;
  Array geneTranscriptEntries;
  Texta geneTranscriptIds;
  Array alterations;
  Alteration *currAlteration,*nextAlteration;
  char *proteinSequenceBeforeIndel;
  char *proteinSequenceAfterIndel;
  int numDisabledTranscripts;
  Stringa disabledTranscripts;
  int seqLength,refLength,altLength;
  char *sequenceBeforeIndel = NULL;
  int overlapMode;
  int numOverlaps;
  int sizeIndel,indelOffset;
  int overlap;
  Array coordinates;
  VcfEntry *currVcfEntry;
  VcfGenotype *currVcfGenotype;
  int position;
  Texta alternateAlleles;
  int flag1,flag2;
  
  if (argc != 3) {
    usage ("%s <annotation.interval> <annotation.fa>",argv[0]);
  }
  // Load the annotation, then read all sequences and sort them by name so
  // that they can later be looked up with arrayFind using the same comparator.
  intervalFind_addIntervalsToSearchSpace (argv[1],0);
  geneTranscriptEntries = util_getGeneTranscriptEntries (intervalFind_getAllIntervals ());
  seq_init ();
  fasta_initFromFile (argv[2]);
  seqs = fasta_readAllSequences (0);
  fasta_deInit ();
  arraySort (seqs,(ARRAYORDERF)util_sortSequencesByName); 
  buffer = stringCreate (100);
  disabledTranscripts = stringCreate (100);
  alterations = arrayCreate (100,Alteration);
  // Start reading the VCF and emit its header, extended by the VA INFO definition.
  vcf_init ("-");
  stringPrintf (buffer,"##INFO=<ID=VA,Number=.,Type=String,Description=\"Variant Annotation, %s\">",argv[1]);
  vcf_addComment (string (buffer));
  puts (vcf_writeMetaData ());
  puts (vcf_writeColumnHeaders ());
  while (currVcfEntry = vcf_nextEntry ()) {
    if (vcf_isInvalidEntry (currVcfEntry)) {
      continue;
    }
    // flag1: the fixed VCF columns of this entry have been printed.
    // flag2: at least one allele annotation has been printed (controls the ',' separator).
    flag1 = 0;
    flag2 = 0;
    position = currVcfEntry->position - 1; // make zero-based
    alternateAlleles = vcf_getAlternateAlleles (currVcfEntry);
    // Each alternate allele is annotated independently; h + 1 is the allele number written to the output.
    for (h = 0; h < arrayMax (alternateAlleles); h++) {
      refLength = strlen (currVcfEntry->referenceAllele);
      altLength = strlen (textItem (alternateAlleles,h));
      sizeIndel = abs (refLength - altLength);
      // Offset from position to the last base spanned by the longer of the two alleles.
      indelOffset = MAX (refLength,altLength) - 1; 
      util_clearAlterations (alterations);
      intervals = intervalFind_getOverlappingIntervals (currVcfEntry->chromosome,position,position + indelOffset);
      for (i = 0; i < arrayMax (intervals); i++) {
        currInterval = arru (intervals,i,Interval*);
        // Classify how the variant span relates to the sub-intervals of this
        // interval. overlapMode keeps the mode of the last matching
        // sub-interval, numOverlaps counts all matching sub-intervals.
        overlapMode = OVERLAP_NONE;
        numOverlaps = 0;
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          overlap = rangeIntersection (position,position + indelOffset,currSubInterval->start,currSubInterval->end);
          if (currSubInterval->start <= position && (position + indelOffset) < currSubInterval->end) {
            overlapMode = OVERLAP_FULLY_CONTAINED;
            numOverlaps++;
          }
          else if (j == 0 && overlap > 0 && position < currSubInterval->start) {
            // Variant starts before the first sub-interval and reaches into it.
            overlapMode = OVERLAP_START;
            numOverlaps++;
          }
          else if (j == (arrayMax (currInterval->subIntervals) - 1) && overlap > 0 && (position + indelOffset) >= currSubInterval->end) {
            // Variant reaches past the end of the last sub-interval.
            overlapMode = OVERLAP_END;
            numOverlaps++;
          }
          else if (overlap > 0 && overlap <= indelOffset) {
            // Partial overlap with a sub-interval that is not covered by the cases above.
            overlapMode = OVERLAP_SPLICE;
            numOverlaps++;
          }
        }
        if (overlapMode == OVERLAP_NONE) {
          continue;
        }
        // A new Alteration is appended for every interval that is hit; the
        // branches below fill in its type.
        currAlteration = arrayp (alterations,arrayMax (alterations),Alteration);
        if (numOverlaps > 1) {
          util_addAlteration (currAlteration,currInterval->name,"multiExonHit",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_SPLICE) {
          util_addAlteration (currAlteration,currInterval->name,"spliceOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_START) {
          util_addAlteration (currAlteration,currInterval->name,"startOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_END) {
          util_addAlteration (currAlteration,currInterval->name,"endOverlap",currInterval,position,0);
          continue;
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength > refLength) {
          // Insertion: a size that is a multiple of three is labelled NFS, anything else FS.
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"insertionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"insertionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength < refLength) {
          // Deletion: same NFS/FS distinction as for insertions.
          if ((sizeIndel % 3) == 0) {
            util_addAlteration (currAlteration,currInterval->name,"deletionNFS",currInterval,position,0);
          }
          else {
            util_addAlteration (currAlteration,currInterval->name,"deletionFS",currInterval,position,0);
          }
        }
        else if (numOverlaps == 1 && overlapMode == OVERLAP_FULLY_CONTAINED && altLength == refLength) {
          util_addAlteration (currAlteration,currInterval->name,"substitution",currInterval,position,0);
        }
        else {
          die ("Unexpected type: %d %s %s %s",
               currVcfEntry->position,currVcfEntry->chromosome,
               currVcfEntry->referenceAllele,currVcfEntry->alternateAllele);
        }
        // Indels whose size is not a multiple of three get no protein-level comparison.
        if ((sizeIndel % 3) != 0 && altLength != refLength) { 
          continue;
        }
        // Only run the remaining block of code if the indel is fully contained (insertion or deletion) AND does not cause a frameshift OR
        // if it is a substitution that is fully contained in the coding sequence
        // Build the lookup key name|chromosome|strand|start|end|... for this interval
        // (presumably the naming scheme of the FASTA entries - the lookup below dies if it does not match).
        stringPrintf (buffer,"%s|%s|%c|",currInterval->name,currInterval->chromosome,currInterval->strand);
        for (j = 0; j < arrayMax (currInterval->subIntervals); j++) {
          currSubInterval = arrp (currInterval->subIntervals,j,SubInterval);
          stringAppendf (buffer,"%d|%d%s",currSubInterval->start,currSubInterval->end,j < arrayMax (currInterval->subIntervals) - 1 ? "|" : "");
        }
        testSeq.name = hlr_strdup (string (buffer));
        if (!arrayFind (seqs,&testSeq,&index,(ARRAYORDERF)util_sortSequencesByName)) {
          die ("Expected to find %s in seqs",string (buffer));
        }
        hlr_free (testSeq.name);
        currSeq = arrp (seqs,index,Seq);
        strReplace (&sequenceBeforeIndel,currSeq->sequence);
        seqLength = strlen (sequenceBeforeIndel); 
        coordinates = util_getCoordinates (currInterval);
        // arraySort (coordinates,(ARRAYORDERF)util_sortCoordinatesByChromosomeAndTranscriptPosition); Array is already sorted by definition
        // Rebuild the sequence in buffer with the alternate allele applied at the
        // sequence index whose genomic coordinate equals the variant position.
        j = 0;
        stringClear (buffer);
        while (j < seqLength) {
          if (util_getGenomicCoordinate (coordinates,j,currVcfEntry->chromosome) == position) {
            if (altLength > refLength) {
              // Insertion: emit the whole alternate allele in place of this one base.
              stringCat (buffer,textItem (alternateAlleles,h));
              j++;
              continue;
            }
            else if (altLength < refLength) {
              // Deletion: keep the base at the variant position, then jump ahead by (refLength - altLength + 1).
              stringCatChar (buffer,sequenceBeforeIndel[j]);
              j = j + refLength - altLength + 1;
              continue;
            }
            else {
              // Substitution: replace altLength bases by the alternate allele.
              stringCat (buffer,textItem (alternateAlleles,h));
              j = j + altLength;
              continue;
            }
          }
          stringCatChar (buffer,sequenceBeforeIndel[j]);
          j++;
        }
        util_destroyCoordinates (coordinates);
        // The translations are duplicated before use (presumably because util_translate
        // reuses its result buffer between calls - TODO confirm).
        proteinSequenceBeforeIndel = hlr_strdup (util_translate (currInterval,sequenceBeforeIndel));
        proteinSequenceAfterIndel = hlr_strdup (util_translate (currInterval,string (buffer)));
        addSubstitution (currAlteration,proteinSequenceBeforeIndel,proteinSequenceAfterIndel,indelOffset);
        hlr_free (proteinSequenceBeforeIndel);
        hlr_free (proteinSequenceAfterIndel);
      }
      if (arrayMax (alterations) == 0) {
        continue;
      }
      // Sort so that alterations with the same gene id and type are adjacent;
      // each such run is collapsed into one annotation item below.
      arraySort (alterations,(ARRAYORDERF)util_sortAlterationsByGeneIdAndType);
      stringClear (buffer);
      i = 0;
      while (i < arrayMax (alterations)) {
        currAlteration = arrp (alterations,i,Alteration);
        stringAppendf (buffer,"%s%d:%s:%s:%c:%s",stringLen (buffer) == 0 ? "" : ",",h + 1,currAlteration->geneName,currAlteration->geneId,currAlteration->strand,currAlteration->type);
         stringClear (disabledTranscripts);
        // Per-transcript detail: name:id:length, followed by _relativePosition and
        // _substitution where available; the four overlap types carry the length only.
        if (currAlteration->substitution[0] != '\0') {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d_%s",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition,currAlteration->substitution);
        }
        else if (strEqual (currAlteration->type,"multiExonHit") || strEqual (currAlteration->type,"spliceOverlap") ||
                 strEqual (currAlteration->type,"startOverlap") || strEqual (currAlteration->type,"endOverlap")) {
          stringAppendf (disabledTranscripts,"%s:%s:%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength);
        }
        else {
          stringAppendf (disabledTranscripts,"%s:%s:%d_%d",currAlteration->transcriptName,currAlteration->transcriptId,currAlteration->transcriptLength,currAlteration->relativePosition);
        }
        numDisabledTranscripts = 1;
        // Absorb the following alterations that share gene id and type with the current one.
        j = i + 1;
        while (j < arrayMax (alterations)) {
          nextAlteration = arrp (alterations,j,Alteration);
          if (strEqual (currAlteration->geneId,nextAlteration->geneId) && 
              strEqual (currAlteration->type,nextAlteration->type)) {
            if (nextAlteration->substitution[0] != '\0') {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d_%s",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition,nextAlteration->substitution);
            }
            else if (strEqual (nextAlteration->type,"multiExonHit") || strEqual (nextAlteration->type,"spliceOverlap") ||
                     strEqual (nextAlteration->type,"startOverlap") || strEqual (nextAlteration->type,"endOverlap")) {
              stringAppendf (disabledTranscripts,":%s:%s:%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength);
            }
            else {
              stringAppendf (disabledTranscripts,":%s:%s:%d_%d",nextAlteration->transcriptName,nextAlteration->transcriptId,nextAlteration->transcriptLength,nextAlteration->relativePosition);
            }
            numDisabledTranscripts++;
          }
          else {
            break;
          }
          j++;
        }
        // Continue with the first alteration that did not belong to this run.
        i = j;
        // Append <affected transcripts>/<transcript ids returned for this gene id>:<details>.
        geneTranscriptIds = util_getTranscriptIdsForGeneId (geneTranscriptEntries,currAlteration->geneId);
        stringAppendf (buffer,":%d/%d:%s",numDisabledTranscripts,arrayMax (geneTranscriptIds),string (disabledTranscripts));
      }
      // The fixed columns are printed once per entry, on the first allele that has an annotation.
      if (flag1 == 0) {
        printf ("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s;VA=",
                currVcfEntry->chromosome,currVcfEntry->position,currVcfEntry->id,
                currVcfEntry->referenceAllele,currVcfEntry->alternateAllele,
                currVcfEntry->quality,currVcfEntry->filter,currVcfEntry->info);
        flag1 = 1;
      }
      printf ("%s%s",flag2 == 1 ? "," : "",string (buffer)); 
      flag2 = 1;
    }
    // Finish the output line: genotype format and genotype columns (if any), then the newline.
    if (flag1 == 1) {
      for (i = 0; i < arrayMax (currVcfEntry->genotypes); i++) {
        currVcfGenotype = arrp (currVcfEntry->genotypes,i,VcfGenotype);
        if (i == 0) {
          printf ("\t%s\t",currVcfEntry->genotypeFormat);
        }
        printf ("%s%s%s%s",currVcfGenotype->genotype,
                currVcfGenotype->details[0] != '\0' ? ":" : "",
                currVcfGenotype->details[0] != '\0' ?  currVcfGenotype->details : "",
                i < arrayMax (currVcfEntry->genotypes) - 1 ? "\t" : ""); 
      }
      puts ("");
    }
  }
  vcf_deInit ();
  return 0;
}
Beispiel #10
0
  }

  conf_printLinks (fp, confp_get(Conf, "WEB_DATA_DIR"), &rpos, prefix, locus, settings->readlim);
  conf_printFooter (fp);

  fclose (fp);
  stringDestroy (buffer);
  return 0;
}

static void randSelect (Array *arrPER, int readlim)
{
  Array new  = arrayCreate (readlim, PEreads);
  Array hash = arrayCreate (arrayMax (*arrPER), int);
  int rd;
  int max = arrayMax (*arrPER);
  int i, n;
  PEreads *newPER;

  // Initialize "hash table"
  for (i = 0; i < arrayMax(hash); i++)
    *(arrayp (hash, i, int)) = 0;

  n = 0;
  while (n < readlim) {
    rd = rand() % max;
    if (arru (hash, rd, int) == 0) {
      PEreads *oldPER = arrp (*arrPER, rd, PEreads);
      
      newPER = arrayp (new, n, PEreads);
      newPER->read1.chromosome = hlr_strdup (oldPER->read1.chromosome);
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);

  if (argc != 2) {
    usage ("%s <blackList.txt>",argv[0]);
  }  
  fp = fopen( argv[1], "r" );
  
  if( !fp )  die("Unable to open file: %s", argv[1]);
  // reading blacklist file
  LineStream ls = ls_createFromFile( argv[1] );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  fclose(fp);
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], argv[1]);
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  return 0;
}
Beispiel #12
0
/**
 * CGI entry point that renders the HTML detail page of one gene-fusion
 * candidate.
 *
 * Expects exactly two arguments besides the program name:
 *   argv[1]  data-set prefix; entries are read from <WEB_DATA_DIR>/<argv[1]>.gfr
 *   argv[2]  identifier of the GFR entry to display
 * With any other argument count nothing is printed after the cgiHeader() call.
 *
 * The page consists of a summary, the transcript connectivity graph and
 * table, per-transcript information with links, the breakpoint analysis for
 * both orientations and the coordinates of the inter-transcript reads.
 *
 * NOTE(review): argv[1], argv[2] and the entry fields are inserted into file
 * paths and HTML without escaping - confirm that callers sanitize them.
 *
 * @return EXIT_FAILURE if the configuration cannot be opened, EXIT_SUCCESS
 *         otherwise.
 */
int main (int argc, char *argv[])
{
  FILE* ftmp = NULL;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL)
    return EXIT_FAILURE;

  cgiInit();
  cgiHeader("text/html");

  if (argc == 3) {
    GfrEntry *currGE;
    Stringa buffer;
    GfrPairCount *currGEPC;
    GfrInterRead *currGIR;
    int i;

    puts ("<html>");
    puts ("<head>");
    html_printGenericStyleSheet (12);
    puts ("<title>geneFusions Details</title>\n");
    puts ("</head>");
    puts ("<body>");
    buffer = stringCreate (100);
    stringPrintf (buffer, "%s/%s.gfr", confp_get(Conf, "WEB_DATA_DIR"),argv[1]);
    gfr_init (string (buffer));
    while (currGE = gfr_nextEntry ()) {
      fflush (stdout);
      if (!strEqual (currGE->id,argv[2])) {
        continue;
      }

      // Every genome-browser link is widened by the same flanking region, so
      // the setting is looked up and parsed once per page.
      const char *flankSetting = confp_get(Conf, "UCSC_GENOME_BROWSER_FLANKING_REGION");
      if (flankSetting == NULL) {
        die ("UCSC_GENOME_BROWSER_FLANKING_REGION is not set in the configuration file");
      }
      const int flank = atoi (flankSetting);

      // ---- Summary -------------------------------------------------------
      printf ("<h1>Detailed summary for potential gene fusion candidate</h1><br>");
      puts ("<table border=0 cellpadding=10>");
      puts ("<tr align=left valign=top>");
      puts ("<td width=400>");
      puts ("<h2>Summary information</h2><br>");
      printf ("<b>Identifier</b>: %s<br><br>\n",currGE->id);
      printf ("<b>Number of inter paired-end reads</b>: %d<br><br>\n",currGE->numInter);
      printf ("<b>Type</b>: %s<br><br>\n",currGE->fusionType);

      stringPrintf (buffer, "%s/GFF/%s.gff", confp_get(Conf, "WEB_DATA_DIR"),currGE->id);
      ftmp = fopen (string (buffer), "r"); // displaying this only if data are present
      if (ftmp) {
        // The link spans from the start of transcript 1 to the end of transcript 2.
        printf ("<b>Connected Reads</b>: <a href=%s&hgt.customText=%s/GFF/%s.gff target=blank>UCSC connectivity graph</a><br>\n",
                htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                              currGE->chromosomeTranscript1,
                                                              currGE->startTranscript1 - flank,
                                                              currGE->endTranscript2 + flank),
                confp_get(Conf, "WEB_DATA_LINK"),currGE->id);
        fclose (ftmp);
      }

      // ---- Connectivity graph and table ------------------------------------
      puts ("</td>");
      puts ("<td>");
      puts ("<h2>Transcript connectivity graph</h2>");
      printf ("<img src=%s/IMAGES/%s.jpg alt=geneFusionImage>\n", confp_get(Conf, "WEB_DATA_LINK"), currGE->id);
      puts ("</td>");
      puts ("<td>");
      puts ("<h2>Transcript connectivity table</h2><br>");
      puts ("<table border=0>");
      puts ("<tr align=left>");
      puts ("<th width=200>Pair Type</th>");
      puts ("<th width=200>Entry transcript 1</th>");
      puts ("<th width=200>Entry transcript 2</th>");
      puts ("<th width=200>Counts</th>");
      puts ("</tr>");
      fflush (stdout);
      for (i = 0; i < arrayMax (currGE->pairCounts); i++) {
        currGEPC = arrp (currGE->pairCounts,i,GfrPairCount);
        printf ("<tr><td>%s</td><td>%s</td><td>%s</td><td>%.2f</td></tr>\n",
                getPairTypeName(currGEPC->pairType),
                getEntryNumber(currGEPC->number1, currGEPC->pairType, 1),
                getEntryNumber(currGEPC->number2, currGEPC->pairType, 2),
                currGEPC->count);
      }
      puts ("</table>");
      puts ("</td>");
      puts ("</tr>");
      puts ("</table>");
      puts ("<br>");

      // ---- Transcript information ------------------------------------------
      puts ("<h2>Transcript information</h2><br>");
      puts ("<table border=1 cellpadding=10 width=\"80%\">");
      puts ("<tr align=left>");
      puts ("<th width=\"20%\"></th>");
      puts ("<th><font color='blue'>Transcript 1</font></th>");
      puts ("<th><font color='orange'>Transcript 2</font></th>");
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Gene symbol(s)</b></td>");
      printf ("<td width=\"30%%\"><font color='blue'>%s</font></td>\n",processString (currGE->geneSymbolTranscript1));
      printf ("<td width=\"30%%\"><font color='orange'>%s</font></td>\n",processString (currGE->geneSymbolTranscript2));
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Coordinates</b></td>");
      printf ("<td width=\"30%%\">%s:%d-%d</td>\n",currGE->chromosomeTranscript1,currGE->startTranscript1,currGE->endTranscript1);
      printf ("<td width=\"30%%\">%s:%d-%d</td>\n",currGE->chromosomeTranscript2,currGE->startTranscript2,currGE->endTranscript2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Strand</b></td>");
      printf ("<td width=\"30%%\">%c</td>\n",currGE->strandTranscript1);
      printf ("<td width=\"30%%\">%c</td>\n",currGE->strandTranscript2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Gene description(s)</b></td>");
      printf ("<td width=\"30%%\">%s</td>\n",processString (currGE->descriptionTranscript1));
      printf ("<td width=\"30%%\">%s</td>\n",processString (currGE->descriptionTranscript2));
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Number of exons</b></td>");
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numExonsTranscript1);
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numExonsTranscript2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Number of intra paired-end reads</b></td>");
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numIntra1);
      printf ("<td width=\"30%%\">%d</td>\n",currGE->numIntra2);
      puts ("</tr>");
      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Links</b></td>");
      printf ("<td width=\"30%%\">[<a href=%s&hgt.customText=%s/BED/%s_1.bed target=blank>UCSC genome browser</a>]&nbsp;&nbsp;&nbsp;[<a href=%s/FASTA/%s_1.fasta>FASTA file</a>]<br></td>\n",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      printf ("<td width=\"30%%\">[<a href=%s&hgt.customText=%s/BED/%s_2.bed target=blank>UCSC genome browser</a>]&nbsp;&nbsp;&nbsp;[<a href=%s/FASTA/%s_2.fasta>FASTA file</a>]<br></td></tr>\n",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);

      puts ("<tr align=left>");
      puts ("<td width=\"20%\"><b>Expression</b></td>");

      // Expression links are shown only if the per-chromosome file exists.
      stringPrintf (buffer, "%s/BGRS/%s_%s.bgr.gz",
                    confp_get(Conf, "WEB_DATA_DIR"),
                    argv[1],
                    currGE->chromosomeTranscript1);
      ftmp = fopen (string (buffer), "r"); // displaying this only if data are present
      puts ("<td width=\"30%\">");
      if (ftmp) {
        printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target=blank>Expression %s</a>]",
                htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                              currGE->chromosomeTranscript1,
                                                              currGE->startTranscript1 - flank,
                                                              currGE->endTranscript1 + flank),
                confp_get(Conf, "WEB_DATA_LINK"),
                argv[1],
                currGE->chromosomeTranscript1,
                currGE->chromosomeTranscript1);
        fclose (ftmp);
      }
      puts ("</td>");

      stringPrintf (buffer, "%s/BGRS/%s_%s.bgr.gz", confp_get(Conf, "WEB_DATA_DIR"),argv[1],currGE->chromosomeTranscript2);
      ftmp = fopen (string (buffer), "r"); // displaying this only if data are present
      puts ("<td width=\"30%\">");
      if (ftmp) {
        printf ("[<a href=%s&hgt.customText=%s/BGRS/%s_%s.bgr.gz target=blank>Expression %s</a>]",
                htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                              currGE->chromosomeTranscript2,
                                                              currGE->startTranscript2 - flank,
                                                              currGE->endTranscript2 + flank),
                confp_get(Conf, "WEB_DATA_LINK"),
                argv[1],
                currGE->chromosomeTranscript2,
                currGE->chromosomeTranscript2);
        fclose (ftmp);
      }
      puts ("</td>");
      puts ("</tr>");
      puts ("</table><br><br>");

      // ---- Breakpoint analysis ---------------------------------------------
      puts ("<h2>Breakpoint analysis</h2><br>");
      puts ("<table border=1 width=\"80%\" cellpadding=10><thead><tr><th>Orientation</th><th>Alignments</th><th colspan=2>Breakpoints</th></tr></thead><tbody>");
      puts ("<tr><td>Orientation AB</td>");
      // The strand of transcript 1 must be '+' or '-'. Checked once here, which
      // covers both orientations. For transcript 2 anything but '+' is treated
      // as reverse.
      if (currGE->strandTranscript1 != '+' && currGE->strandTranscript1 != '-') {
        die("Strand informatation is not correct (transcript 1): %c", currGE->strandTranscript1);
      }
      const char orient1 = (currGE->strandTranscript1 == '+') ? 'F' : 'R';
      const char orient2 = (currGE->strandTranscript2 == '+') ? 'F' : 'R';

      // buffer holds the base name of the orientation icon, e.g. AB_trans1F_trans2R
      stringPrintf (buffer, "AB_trans1%c_trans2%c", orient1, orient2);
      printf ("<td align=center><a href=%s/ALIGNMENTS/%s_AB_breakPointAlignments.txt><img src=%s/IMAGES/%s.png></img>&nbsp;AB</a></td>",
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              string (buffer));
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_AB_breakPointsTranscript1.wig target=blank>Breakpoints transcript 1 UCSC Genome Browser</a></td>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_AB_breakPointsTranscript2.wig target=blank>Breakpoints transcript 2 UCSC Genome Browser</a></td></tr>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      fflush (stdout);

      puts   ("<tr><td>Orientation BA</td>");
      stringPrintf (buffer, "BA_trans1%c_trans2%c", orient1, orient2);
      printf ("<td align=center><a href=%s/ALIGNMENTS/%s_BA_breakPointAlignments.txt><img src=%s/IMAGES/%s.png></img>&nbsp;BA</a></td>",
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id,
              confp_get(Conf, "WEB_DATA_LINK"),
              string (buffer));
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_BA_breakPointsTranscript2.wig target=blank>Breakpoints transcript 2 UCSC Genome Browser</a></td>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript2,
                                                            currGE->startTranscript2 - flank,
                                                            currGE->endTranscript2 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);
      printf ("<td align=center><a href=%s&hgt.customText=%s/WIGS/%s_BA_breakPointsTranscript1.wig target=blank>Breakpoints transcript 1 UCSC Genome Browser</a></td></tr>",
              htmlLinker_generateLinkToGenomeBrowserAtUCSC ("hg18","vertebrate","human",
                                                            currGE->chromosomeTranscript1,
                                                            currGE->startTranscript1 - flank,
                                                            currGE->endTranscript1 + flank),
              confp_get(Conf, "WEB_DATA_LINK"),
              currGE->id);

      puts ("</tbody></table>");
      puts ("<br><br><br>");
      fflush (stdout);

      // ---- Read coordinates ------------------------------------------------
      puts ("<h2>Read coordinates</h2><br>");
      puts ("<table border=0>");
      puts ("<tr align=left>");
      puts ("<th width=\"10%\">Pair Type</th>");
      puts ("<th width=\"10%\">Entry Transcript 1</th>");
      puts ("<th width=\"10%\">Read start transcript 1</th>");
      puts ("<th width=\"10%\">Read end transcript 1</th>");
      puts ("<th width=\"10%\">Entry Transcript 2</th>");
      puts ("<th width=\"10%\">Read start transcript 2</th>");
      puts ("<th width=\"10%\">Read end transcript 2</th>");
      puts ("</tr>");
      for (i = 0; i < arrayMax (currGE->interReads); i++) {
        currGIR = arrp (currGE->interReads,i,GfrInterRead);
        printf ("<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%s</td><td>%d</td><td>%d</td></tr>\n",
                getPairTypeName(currGIR->pairType),
                getEntryNumber(currGIR->number1, currGIR->pairType, 1),
                currGIR->readStart1,currGIR->readEnd1,
                getEntryNumber(currGIR->number2,currGIR->pairType, 2),
                currGIR->readStart2,
                currGIR->readEnd2);
      }
      puts ("</table><br><br><br>");
      fflush (stdout);
    }
    gfr_deInit ();
    stringDestroy (buffer);

    // <html> and <body> are opened before the loop, so they are closed here,
    // exactly once, whether or not an entry matched.
    puts ("</body>");
    puts ("</html>");
    fflush (stdout);
  }
  confp_close(Conf);

  return EXIT_SUCCESS;
}
int main (int argc, char *argv[])
{
  GfrEntry *currGE;
  BLEntry *currBLE;
  BLEntry currQuery;
  FILE *fp;
  char *line;
  int count;
  int countRemoved;
  
  int index;
  WordIter w;
  Array blackList = arrayCreate(20, BLEntry);
  config *Conf;

  if ((Conf = confp_open(getenv("FUSIONSEQ_CONFPATH"))) == NULL) {
    die("%s:\tCannot find .fusionseqrc: %s", argv[0], getenv("FUSIONSEQ_CONFPATH"));
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "ANNOTATION_DIR")==NULL ) {
    die("%s:\tCannot find ANNOTATION_DIR in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  if( confp_get( Conf, "BLACKLIST_FILENAME")==NULL ) {
    die("%s:\tCannot find BLACKLIST_FILENAME in the configuration file: %s)", argv[0], getenv("FUSIONSEQ_CONFPATH") );
    return EXIT_FAILURE;
  }
  Stringa buffer=stringCreate( 100 );
  stringPrintf( buffer, "%s/%s", confp_get( Conf, "ANNOTATION_DIR"), confp_get( Conf, "BLACKLIST_FILENAME") );
  /*  fp = fopen( string( buffer ), "r" );
  if( !fp )  die("Unable to open file: %s", string(buffer));
  stringDestroy( buffer );
  */ 
// reading blacklist file
  LineStream ls = ls_createFromFile( string(buffer) );
  while( line = ls_nextLine(ls) ) {
    w = wordIterCreate( line, "\t", 1);
    currBLE = arrayp( blackList, arrayMax(blackList), BLEntry);
    currBLE->gene1 = hlr_strdup ( wordNext(w) );
    currBLE->gene2 = hlr_strdup ( wordNext(w) );    
    wordIterDestroy(w);
  }
  //fclose(fp);
  ls_destroy( ls );
  stringDestroy( buffer );
  arraySort( blackList, (ARRAYORDERF) sortBlackListByName1);

  // beginFiltering
  count = 0;
  countRemoved = 0;
  gfr_init ("-");
  puts (gfr_writeHeader ());
  while (currGE = gfr_nextEntry ()) { // reading the gfr
    if( currGE->geneSymbolTranscript1 == NULL ) {
      die("Gene symbols are not present in the GFR file. Please run gfrAddInfo before gfrBlackListFilter.");
      return EXIT_FAILURE;
    }
	
    // creating a new query to the black list
    currQuery.gene1 = currGE->geneSymbolTranscript1;
    currQuery.gene2 = currGE->geneSymbolTranscript2;
    if( strEqual( currQuery.gene1 , currQuery.gene2 ) ) {
	countRemoved++;
	continue;
      }
    // searching against read_1/read_2
    int res = arrayFind( blackList, &currQuery, 
			 &index,  (ARRAYORDERF) sortBlackListByName1);  
    
    if( !res ) { // not found, then searching against read_2/read_1
      currQuery.gene1 = currGE->geneSymbolTranscript2;
      currQuery.gene2 = currGE->geneSymbolTranscript1;
      
      res =  arrayFind( blackList, &currQuery, 
			&index, (ARRAYORDERF) sortBlackListByName1 );
      
      if( !res ) { // not found, write the instance to stdout, update the counts
	puts (gfr_writeGfrEntry (currGE));
	count++;	
      } else { // found: read2/read1
	countRemoved++;
      }	
    } else { //found: read1/read2
      countRemoved++;
    }
  }	           
  gfr_deInit ();
  arrayDestroy( blackList );
  warn ("%s_BlackListFilter: %s",argv[0], confp_get( Conf, "BLACKLIST_FILENAME"));
  warn ("%s_numRemoved: %d",argv[0],countRemoved);
  warn ("%s_numGfrEntries: %d",argv[0],count);
  confp_close( Conf);
  return 0;
}
Beispiel #14
0
/**
 * Emit the "file = ..." line that points at the exon highlight data.
 *
 * If the first region has chromosome == 0 the pre-built
 * <sdata_dir>/exons.hlight.txt is referenced directly (presumably "no
 * chromosome restriction" -- confirm against the caller). Otherwise the
 * per-chromosome exons.hlight.txt files (directories "X" for 23, "Y" for 24,
 * the number itself otherwise) are scanned and every line whose start or end
 * coordinate (space-separated fields 1 and 2) falls inside the region is
 * copied to <sdata_dir>/tmp/exons.hlight_s.txt, which is then referenced.
 *
 * NOTE(review): an exon that starts before and ends after the region matches
 * neither test and is not copied -- confirm that this is intended.
 *
 * @param fp         stream receiving the "file = ..." line
 * @param regions    Array of SRegion_t
 * @param sdata_dir  base data directory
 *
 * On failure to open a file a message is printed to stderr and the function
 * returns without writing to fp.
 */
void incl_getExonHlightFile (FILE *fp, Array regions, char *sdata_dir)
{
  LineStream src;
  FILE *out;
  char *line;
  Texta entry;
  SRegion_t *tmp;
  int i, astart, aend;

  Stringa buffer = stringCreate (50);

  stringPrintf (buffer, "%s/tmp/exons.hlight_s.txt", sdata_dir);
  if (!(out = fopen (string (buffer), "w"))) {
	fprintf (stderr, "Cannot open exons.hlight_s.txt\n");
	stringDestroy (buffer);	/* do not leak the path buffer on early exit */
	return;
  }

  tmp = arrayp (regions, 0, SRegion_t);

  if (tmp->chromosome == 0) {
	fprintf (fp, "file = %s/exons.hlight.txt\n", sdata_dir);
  }
  else {
	for (i = 0; i < arrayMax (regions); i++) {
	  tmp = arrayp (regions, i, SRegion_t);

	  if (tmp->chromosome == 23) {
		stringPrintf (buffer, "%s/X/exons.hlight.txt", sdata_dir);
	  }
	  else if (tmp->chromosome == 24) {
		stringPrintf (buffer, "%s/Y/exons.hlight.txt", sdata_dir);
	  }
	  else {
		stringPrintf (buffer, "%s/%i/exons.hlight.txt", sdata_dir, tmp->chromosome);
	  }

	  if ((src = ls_createFromFile (string (buffer))) == NULL) {
		fprintf (stderr, "Cannot open exons.hlight.txt\n");
		/* release everything acquired so far before bailing out */
		stringDestroy (buffer);
		fclose (out);
		return;
	  }

	  while ((line = ls_nextLine (src)) != NULL) {
		entry = textFieldtokP (line, " ");

		/* Fields 1 and 2 are read below; skip lines that lack them. */
		if (arrayMax (entry) >= 3) {
		  astart = atoi (textItem (entry, 1));
		  aend   = atoi (textItem (entry, 2));

		  if ((astart >= tmp->start && astart <= tmp->end) ||
			  (aend >= tmp->start && aend <= tmp->end)) {
			fprintf (out, "%s\n", line);
		  }
		}
		textDestroy (entry);
	  }

	  /* One stream is opened per region, so each must be destroyed here. */
	  ls_destroy (src);
	}

	fprintf (fp, "file = %s/tmp/exons.hlight_s.txt\n", sdata_dir);
  }

  /* src is not touched here: it was never opened in the chromosome == 0
     branch and is already destroyed in the other one. */
  stringDestroy (buffer);
  fclose (out);
}
Beispiel #15
0
/* Genetic map: base pair coordinates (x) paired index-by-index with genetic
 * positions (g), plus a table (z) that buildMap() derives from them by linear
 * interpolation on a 100 bp grid. */
typedef struct {
  char* chrom ;			/* chromosome name */
  Array x ;			/* of int - base pair coordinates; buildMap() assumes ascending order */
  Array g ;			/* of double - genetic map position, same indexing as x */
  int x0 ;			/* first entry of x, set by buildMap(); origin of the z grid */
  Array z ;			/* of double - genetic map position every 100bp from x0; z[i] is at x0 + 100*i */
} GeneticMap ;

/* The single file-local map instance that buildMap() reads and fills. */
static GeneticMap map ;

/****************************/

/* Fill map.z with the genetic position at every 100 bp step from the first
 * base pair coordinate in map.x, interpolating linearly between neighbouring
 * (x, g) points. Also sets map.x0. Entry 0 of map.z is set to 0.0.
 * Requires map.x to be non-empty; the cursor walk relies on map.x being in
 * ascending order. */
static void buildMap (void)
{
  map.x0 = arr(map.x, 0, int) ;
  int lastX = arr(map.x, arrayMax(map.x)-1, int) ;
  int nSteps = (lastX - map.x0) / 100 ;
  map.z = arrayReCreate (map.z, nSteps, double) ;

  /* Cursors onto the segment [xCursor[0], xCursor[1]] containing the grid point. */
  int *xCursor = arrp(map.x,0,int) ;
  double *gCursor = arrp(map.g,0,double) ;
  int step ;

  array(map.z,0,double) = 0.0 ;
  for (step = 1 ; step <= nSteps ; ++step)
    { int gridX = map.x0 + 100*step ;

      /* Advance until the segment's right end reaches the grid point. */
      while (xCursor[1] < gridX)
	{ ++xCursor ;
	  ++gCursor ;
	}

      int offset = gridX - xCursor[0] ;
      double interpolated =
	gCursor[0] + offset * (gCursor[1] - gCursor[0]) / (xCursor[1] - xCursor[0]) ;
      array(map.z,step,double) = interpolated ;
    }
}

/****************************/

void readGeneticMap (FILE *fp)
Beispiel #16
0
int main(int argc, char *argv[])
{
	Array breakPoints;
	BreakPoint *currBP;
	BreakPointRead *currBPR;
	int minNumReads, minNumUniqueOffsets,
	    minNumReadsForKS,numPossibleOffsets;
	double pValueCutoffForKS;
	Array offsets;
	Array randomNumbers;
	double *observedOffsets;
	double *randomOffsets;

 	if (argc != 6) {
		usage((char*) "%s <minNumReads> <minNumUniqueOffsets> "
              "<minNumReadsForKS> <pValueCutoffForKS> <numPossibleOffsets>", 
              argv[0]);
    }
	
	minNumReads         = std::atoi(argv[1]);
	minNumUniqueOffsets = std::atoi(argv[2]);
	minNumReadsForKS    = std::atoi(argv[3]);
	pValueCutoffForKS   = std::atof(argv[4]);
	numPossibleOffsets  = std::atoi(argv[5]);
	bp_init("-");
	offsets = arrayCreate(100, int);
	randomNumbers = arrayCreate(100, int);
	breakPoints = bp_getBreakPoints();

	for (int i = 0; i < arrayMax(breakPoints); i++) {
		currBP = arrp(breakPoints, i, BreakPoint);
		arrayClear(offsets);
		for (int j = 0; j < arrayMax(currBP->breakPointReads); j++) {
			currBPR = arrp(currBP->breakPointReads, j, BreakPointRead);
			array(offsets, arrayMax(offsets), int) = currBPR->offset;
		}
		arraySort(offsets, (ARRAYORDERF) arrayIntcmp);
		arrayUniq(offsets, NULL, (ARRAYORDERF) arrayIntcmp);
		if (arrayMax(currBP->breakPointReads) >= minNumReads && 
		    arrayMax(currBP->breakPointReads) < minNumReadsForKS) {        
			if (arrayMax(offsets) >= minNumUniqueOffsets)
				std::puts(bp_writeBreakPoint(currBP));
		}
		else if (arrayMax(currBP->breakPointReads) >= minNumReads && 
			 arrayMax(currBP->breakPointReads) >= minNumReadsForKS) {
			arrayClear(randomNumbers);
			for (int j = 0; j < arrayMax(offsets); j++)
				array(randomNumbers, arrayMax(randomNumbers), int) = std::rand() % numPossibleOffsets;
			
			arraySort(randomNumbers, (ARRAYORDERF) arrayIntcmp);
			observedOffsets = (double *) hlr_malloc(arrayMax(offsets) * sizeof(double)); 
			randomOffsets = (double *) hlr_malloc(arrayMax(offsets) * sizeof(double)); 
			for (int j = 0; j < arrayMax(offsets); j++) {
				observedOffsets[j] = arru(offsets, j, int);
				randomOffsets[j] = arru(randomNumbers, j, int);
			}
			if (pValueCutoffForKS < TMath::KolmogorovTest(arrayMax(offsets), 
								      observedOffsets, 
								      arrayMax(offsets), 
								      randomOffsets, 
								      ""))
				std::puts(bp_writeBreakPoint(currBP));
			
			hlr_free(observedOffsets);
			hlr_free(randomOffsets);
		}
	}