/* Prints a single interval (entry number index of iit) in the output
   mode selected by the file-level flags.

   The flags centerp, tallyp and runlengthp are tested in that order;
   the first one that is set hands the work to its specialized printer.
   coordstart and coordend are only passed on to those printers and are
   not used by the default output below.

   Default output: unless annotationonlyp is set, a header of the form
   ">label [div:]low..high [type]" is printed (coordinates are swapped
   when signedp is set and the interval sign is negative).  The rest of
   the header returned by IIT_annotation always follows, and then
   either the annotation (fieldint < 0) or the value of field number
   fieldint.

   Returns total plus the result of print_interval_tally in tally mode,
   and 0 in every other mode. */
static long int
print_interval (Chrpos_T *lastcoord, long int total,
		char *divstring, Chrpos_T coordstart, Chrpos_T coordend,
		int index, IIT_T iit, int ndivs, int fieldint) {
  Interval_T interval;
  char *label, *annotation, *restofheader, *fieldvalue;
  bool label_allocp, header_allocp;

  /* Specialized output modes */
  if (centerp == true) {
    print_interval_centered(divstring,coordstart,index,iit,fieldint);
    return 0;
  }
  if (tallyp == true) {
    return total + print_interval_tally(lastcoord,divstring,coordstart,coordend,index,iit,zeroesp);
  }
  if (runlengthp == true) {
    print_interval_runlength(lastcoord,divstring,coordstart,coordend,index,iit,zeroesp);
    return 0;
  }

  /* Header line.  No newline is printed here, because the one printed
     after restofheader below terminates the header. */
  if (annotationonlyp == false) {
    label = IIT_label(iit,index,&label_allocp);
    printf(">%s ",label);
    if (label_allocp == true) {
      FREE(label);
    }

    if (ndivs > 1) {
      if (divstring == NULL) {
	/* For example, if interval was retrieved by label */
	divstring = IIT_divstring_from_index(iit,index);
      }
      printf("%s:",divstring);
    }

    debug(printf("index is %d\n",index));
    interval = IIT_interval(iit,index);
    if (signedp != false && Interval_sign(interval) < 0) {
      /* Signed output of a negative-sign interval: high end first */
      printf("%u..%u",Interval_high(interval),Interval_low(interval));
    } else {
      printf("%u..%u",Interval_low(interval),Interval_high(interval));
    }

    if (Interval_type(interval) > 0) {
      printf(" %s",IIT_typestring(iit,Interval_type(interval)));
    }
  }

  /* Rest of header, followed by the annotation or the requested field */
  annotation = IIT_annotation(&restofheader,iit,index,&header_allocp);
  printf("%s\n",restofheader);
  if (fieldint < 0) {
    printf("%s",annotation);
  }
  if (header_allocp == true) {
    FREE(restofheader);
  }
  if (fieldint >= 0) {
    fieldvalue = IIT_fieldvalue(iit,index,fieldint);
    printf("%s\n",fieldvalue);
    FREE(fieldvalue);
  }

  return 0;
}
/* Puts reference genome into refgenome_fp (assume compressed), and puts
   alternate strain sequences into altstrain_iit.

   Reads FASTA-style text from input one line at a time.  A line that
   starts with '>' is a header: its accession is looked up in contig_iit
   through find_positions to obtain the coordinates of the contig.  Any
   other line is sequence for the most recent header and is stored in
   the in-memory array genomecomp (reference strain), or handed to
   IIT_backfill_sequence when ALTSTRAIN is defined and the contig type
   is positive.  After all input is consumed, the nuint4 words of
   genomecomp are written at the start of refgenome_fp.

   fileroot is used only in progress messages on stderr.  Progress is
   reported for the first 49 contigs only. */
static void
genome_write_memory (FILE *refgenome_fp, FILE *input,
		     IIT_T contig_iit, IIT_T altstrain_iit, UINT4 *genomecomp, unsigned int nuint4,
		     char *fileroot) {
  char Buffer[BUFFERSIZE], Complement[BUFFERSIZE], *segment;
  char *accession, *p;
  Genomicpos_T leftposition, rightposition, startposition, endposition, truelength,
    maxposition = 0, currposition = 0;
  /* Initialized so that sequence lines appearing before the first
     header are handled deterministically (forward reference strain at
     position 0) instead of reading indeterminate values. */
  int contigtype = 0;
  bool revcompp = false;
  size_t seglength;
#ifdef ALTSTRAIN
  int altstrain_index, altstrain_offset;
#endif
  int nbadchars = 0;
  int ncontigs = 0;

  while (fgets(Buffer,BUFFERSIZE,input) != NULL) {
    if (Buffer[0] == '>') {
      /* HEADER */
      accession = parse_accession(Buffer);
      find_positions(&revcompp,&leftposition,&rightposition,&startposition,&endposition,
		     &truelength,&contigtype,accession,contig_iit);
      if (++ncontigs < 50) {
	if (revcompp == true) {
	  fprintf(stderr,"Writing contig %s to universal coordinates %u..%u in genome %s\n",
		  accession,startposition,endposition,fileroot);
	} else {
	  fprintf(stderr,"Writing contig %s to universal coordinates %u..%u in genome %s\n",
		  accession,startposition+1U,endposition+1U,fileroot);
	}
      } else if (ncontigs == 50) {
	fprintf(stderr,"More than 50 contigs. Will stop printing messages\n");
      }

      if (contigtype > 0) {
#ifdef ALTSTRAIN
	fprintf(stderr," (alternate strain %s)",IIT_typestring(altstrain_iit,contigtype));
#endif
      }
      FREE(accession);

      if (contigtype > 0) {
#ifdef ALTSTRAIN
	/* Initialize file pointer for alternate strain */
	altstrain_index = IIT_get_exact(altstrain_iit,/*divstring*/NULL,leftposition,rightposition,contigtype);
	if (revcompp == true) {
	  altstrain_offset = rightposition + 1U - leftposition;
	} else {
	  altstrain_offset = 0;
	}
	debug(printf("Setting altstrain_offset to be %d\n",altstrain_offset));
#endif
      }

      /* Handles case where true length is greater than provided
	 coordinates.  This needs to be after call to IIT_get_exact.
	 NOTE(review): the test uses leftposition + truelength - 1 but
	 the assignment uses leftposition + truelength; confirm which
	 convention is intended. */
      if (leftposition + truelength - 1U > rightposition) {
	debug(fprintf(stderr,"Extending endposition for truelength of %u\n",truelength));
	rightposition = leftposition + truelength;
	if (revcompp == true) {
	  endposition = startposition - truelength;
	} else {
	  endposition = startposition + truelength;
	}
      }

      /* In both cases, set file pointer for reference strain,
	 although we won't write sequence of alternate strain.
	 For an alternate strain, ensure that we fill the reference
	 strain with sufficient X's. */
      if (startposition > maxposition) {
	/* Start beyond end of file */
	debug(printf("Filling with X's from %u to %u-1\n",maxposition,startposition));
	fill_x_memory(genomecomp,maxposition,startposition);

	if (contigtype > 0) {
#ifdef ALTSTRAIN
	  fill_x_memory(genomecomp,leftposition,rightposition + 1);
	  maxposition = currposition = rightposition + 1;
#endif
	} else {
	  maxposition = rightposition;
	  currposition = startposition;
	}

      } else {
	/* Start within file */
	if (contigtype > 0) {
#ifdef ALTSTRAIN
	  if (rightposition + 1 > maxposition) {
	    debug(printf("Filling with X's from %u to %u-1\n",maxposition,rightposition+1));
	    fill_x_memory(genomecomp,maxposition,rightposition + 1);
	    maxposition = currposition = rightposition + 1;
	  }
#endif
	} else {
	  debug(printf("Moving to %u\n",startposition));
	  currposition = startposition;
	}
      }

    } else {
      /* SEQUENCE */
      /* Strip the trailing newline and, if present, a carriage return */
      if ((p = rindex(Buffer,'\n')) != NULL) {
	*p = '\0';
      }
      if ((p = rindex(Buffer,CONTROLM)) != NULL) {
	*p = '\0';
      }
      if (revcompp == true) {
	make_complement_buffered(Complement,Buffer,strlen(Buffer));
	segment = Complement;
      } else {
	segment = Buffer;
      }
      /* Length of this line's sequence, computed once */
      seglength = strlen(segment);

      if (contigtype > 0) {
#ifdef ALTSTRAIN
	/* Write alternate strain */
	if (revcompp == true) {
	  altstrain_offset -= seglength;
	  debug(printf("Writing alternate strain at %u\n",altstrain_offset));
	  IIT_backfill_sequence(altstrain_iit,altstrain_index,altstrain_offset,segment);
	} else {
	  debug(printf("Writing alternate strain at %u\n",altstrain_offset));
	  IIT_backfill_sequence(altstrain_iit,altstrain_index,altstrain_offset,segment);
	  altstrain_offset += seglength;
	}
#endif
      } else {
	/* Write reference strain */
	if (revcompp == true) {
	  /* Reverse-complemented contigs are filled from high to low,
	     so step back before storing this line */
	  debug(printf("Filling with sequence from %u-1 to %u\n",currposition,
		       (unsigned int) (currposition-seglength)));
	  currposition -= seglength;
	  nbadchars = Compress_update_memory(nbadchars,genomecomp,segment,currposition,currposition+seglength);
	} else {
	  debug(printf("Filling with sequence from %u to %u-1\n",currposition,
		       (unsigned int) (currposition+seglength)));
	  nbadchars = Compress_update_memory(nbadchars,genomecomp,segment,currposition,currposition+seglength);
	  currposition += seglength;
	  if (currposition > maxposition) {
	    maxposition = currposition;
	  }
	}
      }
    }
  }

  /* Dump the whole in-memory genome at the beginning of the file */
  move_absolute(refgenome_fp,0U);
  FWRITE_UINTS(genomecomp,nuint4,refgenome_fp);

  fprintf(stderr,"A total of %d non-ACGTNX characters were seen in the genome.\n",nbadchars);

  return;
}
/* Prints one interval on a single line with its annotation (or the
   value of field number fieldint when fieldint >= 0) laid out so that
   the character corresponding to coordstart is bracketed, e.g.
   "...ACG[T]ACG...", and padded with spaces according to the file-level
   centerlength.  For an interval with negative sign the text is printed
   through print_complement and the bracketed character is mapped
   through complCode.  After the text come tab-separated columns: the
   type string (if the type is positive), the signed div and
   coordinates, and the label.

   Nothing is printed for the interval when centeruc is set and the
   center character is lower case.  If coordstart does not map to a
   character inside the annotation, a warning is written to stderr and
   the interval is skipped.

   Need to store just the part of the query specified (e.g., 1..10) */
static void
print_interval_centered (char *divstring, Chrpos_T coordstart,
			 int index, IIT_T iit, int fieldint) {
  Interval_T interval;
  char *label, *annotation, *restofheader, centerchar;
  bool allocp, free_annotation_p;
  int annotlength, left, centerpos;

  if (fieldint < 0) {
    annotation = IIT_annotation(&restofheader,iit,index,&allocp);
    if (allocp == true) {
      FREE(restofheader);
    }
    free_annotation_p = false;
  } else {
    /* Field values are freed by the caller (as print_interval does) */
    annotation = IIT_fieldvalue(iit,index,fieldint);
    free_annotation_p = true;
  }

  /* Ignore one trailing newline; guard against an empty string so that
     annotation[-1] is never read */
  annotlength = strlen(annotation);
  if (annotlength > 0 && annotation[annotlength-1] == '\n') {
    annotlength--;
  }

  interval = IIT_interval(iit,index);
  left = coordstart - Interval_low(interval); /* + length(query) - queryend */
  if (Interval_sign(interval) < 0) {
    centerpos = annotlength-left-1;
  } else {
    centerpos = left;
  }

  if (centerpos < 0 || centerpos >= annotlength) {
    /* The requested coordinate has no character in this annotation;
       indexing it would read outside the string */
    fprintf(stderr,"Center position %d lies outside annotation of length %d for interval %d; skipping\n",
	    centerpos,annotlength,index);

  } else {
    centerchar = annotation[centerpos];

    /* <ctype.h> functions require a value representable as unsigned char */
    if (centeruc == true && islower((unsigned char) centerchar)) {
      /* Lower-case center requested to be skipped: print nothing */

    } else {
      print_spaces(centerlength-left);
      if (Interval_sign(interval) < 0) {
	print_complement(annotation,annotlength-1,centerpos+1);
	printf("[%c]",complCode[(int) centerchar]);
	print_complement(annotation,centerpos-1,0);
      } else {
	print_forward(annotation,0,centerpos-1);
	printf("[%c]",centerchar);
	print_forward(annotation,centerpos+1,annotlength-1);
      }
      print_spaces(centerlength+left-annotlength);

      printf("\t");
      if (Interval_type(interval) > 0) {
	printf("%s\t",IIT_typestring(iit,Interval_type(interval)));
      }

      if (divstring != NULL) {
	if (Interval_sign(interval) < 0) {
	  printf("-%s:",divstring);
	} else {
	  printf("+%s:",divstring);
	}
      }

      if (signedp == false) {
	printf("%u..%u",Interval_low(interval),Interval_high(interval));
      } else if (Interval_sign(interval) < 0) {
	printf("%u..%u",Interval_high(interval),Interval_low(interval));
      } else {
	printf("%u..%u",Interval_low(interval),Interval_high(interval));
      }

      printf("\t");
      label = IIT_label(iit,index,&allocp);
      printf("%s",label);
      if (allocp == true) {
	FREE(label);
      }

      printf("\n");
    }
  }

  /* Single cleanup point for the field value on every path */
  if (free_annotation_p == true) {
    FREE(annotation);
  }

  return;
}
/* Permits arbitrary ASCII characters.  Useful for storing numeric data.

   Reads entries from input, each consisting of a header line starting
   with '>' followed by exactly truelength raw bytes (truelength comes
   from looking up the accession in contig_iit through find_positions)
   and then either a linefeed or end of file.  The raw bytes are copied
   uncompressed to refgenome_fp, byte-reversed first when the contig is
   marked as reverse complement.  Gaps before a contig are filled
   through fill_zero, to which index1part is passed unchanged.

   fileroot is used only in progress messages on stderr.  Any format
   violation, premature end of file or short write terminates the
   program with exit code 9. */
static void
genome_writeraw_file (FILE *refgenome_fp, FILE *input,
		      IIT_T contig_iit, IIT_T altstrain_iit, char *fileroot,
		      int index1part) {
  char Buffer[BUFFERSIZE], Reverse[BUFFERSIZE], *segment;
  char *accession;
  int c;			/* int, so that EOF is distinguishable from every byte value */
  Genomicpos_T leftposition, rightposition, startposition, endposition, truelength,
    maxposition = 0, currposition = 0;
  int contigtype;
  int strlength;
  int i;
  bool revcompp;
#ifdef ALTSTRAIN
  int altstrain_index, altstrain_offset;
#endif
  int ncontigs = 0;

  for (i = 0; i < WRITEBLOCK; i++) {
    Empty[i] = 0;
  }

  while (fgets(Buffer,BUFFERSIZE,input) != NULL) {
    if (Buffer[0] != '>') {
      fprintf(stderr,"Expecting to see a new FASTA entry\n");
      fprintf(stderr,"Instead, saw %d (%c)\n",(int) Buffer[0],Buffer[0]);
      exit(9);
    } else {
      /* HEADER */
      accession = parse_accession(Buffer);
      find_positions(&revcompp,&leftposition,&rightposition,&startposition,&endposition,
		     &truelength,&contigtype,accession,contig_iit);
      if (++ncontigs < 50) {
	if (revcompp == true) {
	  fprintf(stderr,"Writing contig %s to universal coordinates %u..%u in genome %s\n",
		  accession,startposition,endposition,fileroot);
	} else {
	  fprintf(stderr,"Writing contig %s to universal coordinates %u..%u in genome %s\n",
		  accession,startposition+1U,endposition+1U,fileroot);
	}
      } else if (ncontigs == 50) {
	fprintf(stderr,"More than 50 contigs. Will stop printing messages\n");
      }

      if (contigtype > 0) {
#ifdef ALTSTRAIN
	fprintf(stderr," (alternate strain %s)",IIT_typestring(altstrain_iit,contigtype));
#endif
      }
      FREE(accession);

      if (contigtype > 0) {
#ifdef ALTSTRAIN
	/* Initialize file pointer for alternate strain */
	altstrain_index = IIT_get_exact(altstrain_iit,/*divstring*/NULL,leftposition,rightposition,contigtype);
	if (revcompp == true) {
	  altstrain_offset = rightposition + 1U - leftposition;
	} else {
	  altstrain_offset = 0;
	}
	debug(printf("Setting altstrain_offset to be %d\n",altstrain_offset));
#endif
      }

      /* Handles case where true length is greater than provided
	 coordinates.  This needs to be after call to IIT_get_exact */
      if (leftposition + truelength - 1U > rightposition) {
	debug(fprintf(stderr,"Extending endposition for truelength of %u\n",truelength));
	rightposition = leftposition + truelength;
	if (revcompp == true) {
	  endposition = startposition - truelength;
	} else {
	  endposition = startposition + truelength;
	}
      }

      /* In both cases, set file pointer for reference strain,
	 although we won't write sequence of alternate strain.
	 For an alternate strain, ensure that we fill the reference
	 strain with sufficient X's. */
      if (startposition > maxposition) {
	/* Start beyond end of file */
	debug(printf("Filling with zeroes from %u to %u-1\n",maxposition,startposition));
	fill_zero(refgenome_fp,maxposition,startposition,/*uncompressedp*/true,index1part);

	if (contigtype > 0) {
#ifdef ALTSTRAIN
	  fill_zero(refgenome_fp,leftposition,rightposition + 1,/*uncompressedp*/true,index1part);
	  maxposition = currposition = rightposition + 1;
#endif
	} else {
	  maxposition = rightposition;
	  currposition = startposition;
	}

      } else {
	/* Start within file */
	if (contigtype > 0) {
#ifdef ALTSTRAIN
	  if (rightposition + 1 > maxposition) {
	    debug(printf("Filling with zeroes from %u to %u-1\n",maxposition,rightposition+1));
	    fill_zero(refgenome_fp,maxposition,rightposition + 1,/*uncompressedp*/true,index1part);
	    maxposition = currposition = rightposition + 1;
	  }
#endif
	} else {
	  debug(printf("Moving to %u\n",startposition));
	  move_absolute(refgenome_fp,startposition);
	  currposition = startposition;
	}
      }

      /* SEQUENCE */
      fprintf(stderr,"Processing %u characters\n",truelength);
      while (truelength > 0) {
	/* Read the raw data in chunks of at most BUFFERSIZE bytes */
	if (truelength > BUFFERSIZE) {
	  if ((strlength = fread(Buffer,sizeof(char),BUFFERSIZE,input)) < BUFFERSIZE) {
	    fprintf(stderr,"Expecting %u more characters, but saw end of file\n",truelength);
	    exit(9);
	  }
	  truelength -= BUFFERSIZE;
	} else {
	  if ((strlength = fread(Buffer,sizeof(char),truelength,input)) < truelength) {
	    fprintf(stderr,"Expecting %u more characters, but saw end of file\n",truelength);
	    exit(9);
	  }
	  truelength = 0;
	}

	if (revcompp == true) {
	  make_reverse_buffered(Reverse,Buffer,strlength);
	  segment = Reverse;
	} else {
	  segment = Buffer;
	}

	if (contigtype > 0) {
	  /* NOTE(review): the guard below is spelled ALTSTRAING, unlike
	     the ALTSTRAIN guards used elsewhere in this function, so
	     this branch is compiled out even when ALTSTRAIN is defined.
	     Confirm whether that is intentional before changing it: the
	     segment is raw data without a terminating \0 (see the
	     comment inside the branch). */
#ifdef ALTSTRAING
	  /* Write alternate strain.  It is likely that IIT commands
	     will fail because they depend on \0 to terminate the segment.  */
	  if (revcompp == true) {
	    altstrain_offset -= strlength;
	    debug(printf("Writing alternate strain at %u\n",altstrain_offset));
	    IIT_backfill_sequence(altstrain_iit,altstrain_index,altstrain_offset,segment);
	  } else {
	    debug(printf("Writing alternate strain at %u\n",altstrain_offset));
	    IIT_backfill_sequence(altstrain_iit,altstrain_index,altstrain_offset,segment);
	    altstrain_offset += strlength;
	  }
#endif
	} else {
	  /* Write reference strain */
	  if (revcompp == true) {
	    debug(printf("Filling with sequence from %u-1 to %u\n",currposition,currposition-strlength));
	    currposition -= strlength;
	    if (fwrite(segment,sizeof(char),strlength,refgenome_fp) < (size_t) strlength) {
	      fprintf(stderr,"Unable to write %d characters to genome file\n",strlength);
	      exit(9);
	    }
	  } else {
	    debug(printf("Filling with sequence from %u to %u-1\n",currposition,currposition+strlength));
	    if (fwrite(segment,sizeof(char),strlength,refgenome_fp) < (size_t) strlength) {
	      fprintf(stderr,"Unable to write %d characters to genome file\n",strlength);
	      exit(9);
	    }
	    currposition += strlength;
	    if (currposition > maxposition) {
	      maxposition = currposition;
	    }
	  }
	}
      }

      /* The raw data must be followed by a linefeed or end of file */
      if ((c = fgetc(input)) != EOF && c != '\n') {
	fprintf(stderr,"Expecting linefeed at end of sequence.  Saw %d instead\n",
		c);
	exit(9);
      }
    }
  }

  return;
}