/*
 * Count the candidate sequence lines of a Stockholm-formatted buffer.
 *
 * The buffer is walked one newline at a time, so the text in front of the
 * first newline is never examined.  The walk stops at the first line for
 * which byg_start("//", line) is 0.  Every other line is counted unless
 * byg_end("#", line) is 1.
 *
 * NOTE(review): this reads as "stop at a line starting with //, skip lines
 * starting with #", which assumes byg_start()/byg_end() return the offset of
 * the start / one past the end of the first match - confirm against their
 * definitions.
 *
 * string: NUL-terminated text; only read here.
 * Returns the number of counted lines, 0 when there are none.
 */
int count_sequences_stockholm(char* string)
{
	char *line = string;
	int total = 0;
	int step;

	for (;;) {
		step = byg_end("\n", line);
		if (step == -1) {
			break;
		}
		line += step;

		if (byg_start("//", line) == 0) {
			break;
		}

		if (byg_end("#", line) != 1) {
			total += 1;
		}
	}

	return total;
}
/*
 * Count the sequences of a Clustal-formatted buffer.
 *
 * Lines are visited one newline at a time (the text before the first newline
 * is never examined).  A line is treated as a sequence row when
 *   - byg_end("\n", line) is greater than 2,
 *   - byg_end(" ", line) is smaller than that value, and
 *   - byg_end(" ", line) is not 1.
 * Consecutive sequence rows form a block; the size of the largest block is
 * the result.
 *
 * The acceptance test is intentionally unchanged: callers size their arrays
 * from this count, so it must never become stricter than the test used by
 * the code that later fills those arrays.
 *
 * The former inner loop that walked the first row of every block up to its
 * newline had no effect (its only purpose, a length counter, had been
 * commented out) and has been removed.
 *
 * string: NUL-terminated text; only read here.
 * Returns the size of the largest block, 0 when no sequence row was seen.
 */
int count_sequences_clustalw(char* string)
{
	char *line = string;
	int step = 0;
	int name_end = 0;   /* result of byg_end(" ", line)  */
	int line_end = 0;   /* result of byg_end("\n", line) */
	int in_block = 0;   /* rows seen in the block currently being read */
	int largest = 0;

	while ((step = byg_end("\n", line)) != -1) {
		line += step;
		name_end = byg_end(" ", line);
		line_end = byg_end("\n", line);

		if (line_end > 2 && line_end > name_end && name_end != 1) {
			in_block++;
		} else if (in_block) {
			/* A non-sequence row closes the current block. */
			if (in_block > largest) {
				largest = in_block;
			}
			in_block = 0;
		}
	}

	/*
	 * No fold-in is needed here: the loop can only end right after an
	 * iteration that took the "else" path, so in_block is already 0.
	 */
	return largest;
}
struct alignment* read_sequences_uniprot_xml(struct alignment* aln,char* string) { int c = 0; int n = 0; int i = 0; int j = 0; char *p1 = 0; int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,23,13,14,15,16,17,17,18,19,20,21,22}; //int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,-1,13,14,15,16,17,-1,18,19,20,21,22}; /*aln = (struct alignment *) malloc(sizeof(struct alignment)); numseq = byg_count("<entry",string); if(!numseq){ k_printf("No sequences found!\n"); exit(1); } numprofiles = (numseq << 1) - 1; aln->s = malloc(sizeof(int*) * (numseq )); aln->seq = malloc(sizeof(char*) * (numseq )); aln->si = 0; aln->ft = 0; aln->sl = malloc(sizeof(int) * (numprofiles)); aln->sip = malloc(sizeof(int*)* numprofiles); aln->nsip = malloc(sizeof(int)* numprofiles); aln->sn = malloc(sizeof(char*) * numseq); aln->lsn = malloc(sizeof(int) * numseq); for (i =0;i < numprofiles;i++){ aln->sip[i] = 0; aln->nsip[i] = 0; } for(i =0;i < numseq;i++){ aln->sip[i] = malloc(sizeof(int)*1); aln->nsip[i] = 1; aln->sip[i][0] = i; }*/ p1 = string; c = 0; while(aln->sl[c]){ c++; } while((i = byg_end("<entry",p1))!=-1){ p1+=i;// p1 is at start of entry; i = byg_end("<name>",p1); p1 +=i; //p1 is at the end of the sequence name tag j = byg_start("</name>",p1); aln->lsn[c] = j; aln->sn[c] = malloc(sizeof(char)*(j+1)); for (i = 0;i < j;i++){ aln->sn[c][i] = p1[i]; } aln->sn[c][j] = 0; while((i = byg_end("<sequence",p1))!= -1 ){ i = byg_end("<sequence",p1); p1+= i; i = byg_end(">",p1); p1 +=i; } j = byg_start("</sequence>",p1); aln->s[c] = malloc(sizeof(int)*(j+1)); aln->seq[c] = malloc(sizeof(char)*(j+1)); n = 0; for (i = 0;i < j;i++){ if(isalpha((int)p1[i])){ aln->s[c][n] = aacode[toupper(p1[i])-65]; aln->seq[c][n] = p1[i]; n++; } } aln->s[c][n] = 0; aln->seq[c][n] = 0; aln->sl[c] = n; c++; } free(string); return aln; }
/*
 * Read "<fitem>" records starting at `p` and append them to the feature
 * list `ft`.
 *
 * Reading stops when no further "<fitem>" is found, or when the next
 * "<fitem>" lies behind the next "</seq-info>" (this includes the case that
 * "</seq-info>" is not found at all).  For each record the text of <ftype>
 * and <fnote> is copied and <fstart>/<fstop> are converted with atoi().
 * New nodes get color -1 and are appended in input order.
 *
 * Fixes relative to the previous version:
 *   - type/note buffers were sized with sizeof(char*) instead of
 *     sizeof(char);
 *   - <fstart>/<fstop> were copied into a 10-byte scratch buffer without a
 *     length check; atoi() is now applied to the text in place (it stops at
 *     the '<' of the closing tag, so the result is unchanged);
 *   - a missing tag (byg_* returning -1) or a failed malloc() ends parsing
 *     and returns the list built so far instead of writing out of bounds;
 *   - the list tail is located once rather than before every append.
 *
 * NOTE(review): assumes byg_end()/byg_start() return the offset one past the
 * end / at the start of the first match and -1 when nothing matches.
 *
 * ft: existing list head, may be 0.
 * p:  NUL-terminated text to parse; only read here.
 * Returns the (possibly new) list head.
 */
struct feature* read_ft(struct feature* ft,char* p)
{
	struct feature *n = 0;
	struct feature *tail = ft;
	char *p1 = p;
	char *type_text = 0;
	int type_len = 0;
	int note_len = 0;
	int start = 0;
	int stop = 0;
	int i = 0;
	int j = 0;

	if (tail != 0) {
		while (tail->next != 0) {
			tail = tail->next;
		}
	}

	while ((j = byg_end("<fitem>", p1)) != -1) {
		i = byg_end("</seq-info>", p1);
		if (j > i) {
			break;      /* next item is outside the current <seq-info> */
		}
		p1 += j;

		i = byg_end("<ftype>", p1);
		if (i == -1) {
			break;
		}
		p1 += i;
		type_text = p1;
		type_len = byg_start("</ftype>", p1);
		if (type_len == -1) {
			break;
		}

		i = byg_end("<fstart>", p1);
		if (i == -1) {
			break;
		}
		p1 += i;
		start = atoi(p1);

		i = byg_end("<fstop>", p1);
		if (i == -1) {
			break;
		}
		p1 += i;
		stop = atoi(p1);

		i = byg_end("<fnote>", p1);
		if (i == -1) {
			break;
		}
		p1 += i;
		note_len = byg_start("</fnote>", p1);
		if (note_len == -1) {
			break;
		}

		/* Everything was located: build the node. */
		n = malloc(sizeof(struct feature));
		if (n == 0) {
			break;
		}
		n->next = 0;
		n->color = -1;
		n->start = start;
		n->end = stop;
		n->type = malloc(sizeof(char) * (type_len + 1));
		n->note = malloc(sizeof(char) * (note_len + 1));
		if (n->type == 0 || n->note == 0) {
			free(n->type);
			free(n->note);
			free(n);
			break;
		}
		for (i = 0; i < type_len; i++) {
			n->type[i] = type_text[i];
		}
		n->type[type_len] = 0;
		for (i = 0; i < note_len; i++) {
			n->note[i] = p1[i];
		}
		n->note[note_len] = 0;

		if (tail != 0) {
			tail->next = n;
		} else {
			ft = n;
		}
		tail = n;
		n = 0;
	}
	return ft;
}
/*
 * Parse the MACSIM XML text in `string` and append one sequence per
 * "<sequence" element to `aln`.
 *
 * Step 1: every "<g>...<r>" and "</r>...</g>" span is overwritten with
 *         blanks in place.
 * Step 2: for each "<sequence" element, and only for tags located before
 *         the element's "</sequence>":
 *           <seq-name>  -> aln->sn[c] / aln->lsn[c]
 *           <ftable>    -> aln->ft[c] via read_ft()
 *           <seq-data>  -> aln->seq[c] (characters above 32), aln->s[c]
 *                          (aacode[] value for letters, -1 otherwise) and
 *                          aln->sl[c]
 *         The slot index c advances once per element, starting at the first
 *         index whose aln->sl[] is 0.
 *
 * Fixes relative to the previous version:
 *   - a tag that is not found (-1) used to satisfy "i < max", which moved
 *     the cursor backwards and could produce a negative length;
 *   - the element end is now tracked as a pointer; the old offset was not
 *     adjusted after the cursor moved, so tags of the following element
 *     could be attributed to the current one;
 *   - the blanking loop stops on a missing <r>, </r> or </g> instead of
 *     looping forever;
 *   - <ctype.h> functions receive unsigned char values and the aacode index
 *     is range-checked.
 *
 * NOTE(review): assumes byg_end()/byg_start() return the offset one past the
 * end / at the start of the first match and -1 when nothing matches.
 * NOTE(review): malloc() results are still used unchecked, as before.
 *
 * aln:    destination; its arrays must already have room for the sequences.
 * string: NUL-terminated XML text; modified in place and free()d on return.
 * Returns aln.
 */
struct alignment* read_alignment_macsim_xml(struct alignment* aln,char* string)
{
	/* Code per letter, indexed by (uppercase letter - 'A'). */
	static const int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,23,13,14,15,16,17,17,18,19,20,21,22};
	char *p = string;
	char *seq_end = NULL;   /* one past "</sequence>" of the current element */
	int c = 0;
	int n = 0;
	int i = 0;
	int j = 0;
	int max = 0;
	int idx = 0;

	/* Step 1: blank out the <g>/<r> wrappers. */
	if (byg_count("<g>", p)) {
		while ((i = byg_start("<g>", p)) != -1) {
			p += i;
			j = byg_end("<r>", p);
			if (j == -1) {
				break;
			}
			for (i = 0; i < j; i++) {
				p[i] = ' ';
			}
			i = byg_start("</r>", p);
			if (i == -1) {
				break;
			}
			p += i;
			j = byg_end("</g>", p);
			if (j == -1) {
				break;
			}
			for (i = 0; i < j; i++) {
				p[i] = ' ';
			}
		}
	}

	/* Step 2: read the sequences. */
	p = string;
	c = 0;
	while (aln->sl[c]) {
		c++;
	}

	while ((i = byg_end("<sequence", p)) != -1) {
		p += i;
		max = byg_end("</sequence>", p);
		seq_end = (max == -1) ? NULL : p + max;

		i = byg_end("<seq-name>", p);
		if (seq_end != NULL && i != -1 && p + i < seq_end) {
			p += i;
			j = byg_start("</seq-name>", p);
			if (j < 0) {
				j = 0;
			}
			aln->lsn[c] = j;
			aln->sn[c] = malloc(sizeof(char) * (j + 1));
			for (i = 0; i < j; i++) {
				aln->sn[c][i] = p[i];
			}
			aln->sn[c][j] = 0;
		}

		i = byg_end("<ftable>", p);
		if (seq_end != NULL && i != -1 && p + i < seq_end) {
			aln->ft[c] = read_ft(aln->ft[c], p);
		}

		i = byg_end("<seq-data>", p);
		if (seq_end != NULL && i != -1 && p + i < seq_end) {
			p += i;
			j = byg_start("</seq-data>", p);
			if (j < 0) {
				j = 0;
			}
			aln->s[c] = malloc(sizeof(int) * (j + 1));
			aln->seq[c] = malloc(sizeof(char) * (j + 1));
			n = 0;
			for (i = 0; i < j; i++) {
				if (p[i] > 32) {
					aln->s[c][n] = -1;
					if (isalpha((unsigned char)p[i])) {
						idx = toupper((unsigned char)p[i]) - 'A';
						if (idx >= 0 && idx < 26) {
							aln->s[c][n] = aacode[idx];
						}
					}
					aln->seq[c][n] = p[i];
					n++;
				}
			}
			aln->s[c][n] = 0;
			aln->seq[c][n] = 0;
			aln->sl[c] = n;
		}
		c++;
	}

	free(string);
	return aln;
}
struct alignment* read_alignment_from_swissprot(struct alignment* aln,char* string) { //int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,-1,13,14,15,16,17,-1,18,19,20,21,22}; int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,23,13,14,15,16,17,17,18,19,20,21,22}; int i,j,c,n; char* p = 0; p = string; /*numseq = byg_count("ID ",p); if(!numseq){ k_printf("No sequences found!\n"); exit(1); } aln = (struct alignment *) malloc(sizeof(struct alignment)); numprofiles = (numseq << 1) - 1; aln->ft = 0; aln->si = 0; aln->s = malloc(sizeof(int*) * (numseq )); aln->seq = malloc(sizeof(char*) * (numseq )); aln->sl = malloc(sizeof(int) * (numprofiles)); aln->sip = malloc(sizeof(int*)* numprofiles); aln->nsip = malloc(sizeof(int)* numprofiles); aln->sn = malloc(sizeof(char*) * numseq); aln->lsn = malloc(sizeof(int) * numseq); for (i =0;i < numprofiles;i++){ aln->sip[i] = 0; aln->nsip[i] = 0; } for (i = numseq;i--;){ aln->sip[i] = malloc(sizeof(int)*1); aln->nsip[i] = 1; aln->sip[i][0] = i; }*/ c = 0; while(aln->sl[c]){ c++; } k_printf("found sequence:\n"); while ((i = byg_end("ID ",p)) != -1){ p+=i; j = byg_start(" ",p); aln->lsn[c] = j; aln->sn[c] = malloc(sizeof(char)*(j+1)); for (i = 0;i < j;i++){ aln->sn[c][i] = p[i]; } aln->sn[c][j] = 0; p+= j; j = byg_end("SQ ",p); p+= j; j = byg_end("\n",p); p+= j; j = byg_start("//",p); k_printf("found sequence:\n"); aln->s[c] = malloc(sizeof(int)*(j+1)); aln->seq[c] = malloc(sizeof(char)*(j+1)); n = 0; for (i = 0;i < j;i++){ if((int)p[i] > 32){ if(isalpha((int)p[i])){ aln->s[c][n] = aacode[toupper(p[i])-65]; }else{ aln->s[c][n] = -1; } k_printf("%c",p[i]); aln->seq[c][n] = p[i]; n++; } } k_printf("\n\n"); aln->s[c][n] = 0; aln->seq[c][n] = 0; aln->sl[c] = n; c++; } free(string); return aln; }
struct alignment* read_alignment_clustal(struct alignment* aln,char* string) { int c = 0; int n = 0; int len = 0; int i = 0; int j = 0; int start = 0; char *p1 = 0; int local_numseq = 0; int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,23,13,14,15,16,17,17,18,19,20,21,22}; //int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,-1,13,14,15,16,17,-1,18,19,20,21,22}; //aln = (struct alignment*) malloc(sizeof(struct alignment)); p1 = string; while((i = byg_end("\n",p1))!=-1){ p1+=i; j = byg_end(" ",p1); n = byg_end("\n",p1); if(n > 2 && n>j && j!= 1){ if(c ==0){ i = j; while(p1[i] != '\n'){ if ((int)p1[i] > 32){ len++; } i++; } } c++; }else{ if (c){ if(c > local_numseq){ local_numseq = c; } c =0; } } } /*numprofiles = (numseq << 1) - 1; aln->s = malloc(sizeof(int*) * (numseq )); aln->seq = malloc(sizeof(char*) * (numseq )); aln->ft = 0; aln->si = 0; aln->sl = malloc(sizeof(int) * (numprofiles)); aln->sip = malloc(sizeof(int*)* numprofiles); aln->nsip = malloc(sizeof(int)* numprofiles); aln->sn = malloc(sizeof(char*) * numseq); aln->lsn = malloc(sizeof(int) * numseq); for (i =0;i < numprofiles;i++){ aln->sip[i] = 0; aln->nsip[i] = 0; } for(i =0;i < numseq;i++){ aln->lsn[i] = 0; aln->sip[i] = malloc(sizeof(int)*1); aln->nsip[i] = 1; aln->sip[i][0] = i; aln->sl[i] = 0;*/ start = 0; while(aln->sl[start]){ start++; } for(i =start;i < local_numseq+start;i++){ aln->s[i] = malloc(sizeof(int)*(len+1)); aln->seq[i] = malloc(sizeof(char)*(len+1)); } p1 = string; c = start; while((i = byg_end("\n",p1))!=-1){ p1+=i; j = byg_end(" ",p1); n = byg_end("\n",p1); if(n > 2 && n>j && j!= 1){ if(aln->lsn[c] == 0){ aln->lsn[c] = j; aln->sn[c] = malloc(sizeof(char)*(j+1)); for (i = 0;i < j;i++){ aln->sn[c][i] = p1[i]; } aln->sn[c][j] = 0; } for (i = j;i < n;i++){ if((int)p1[i] > 32){ if(isalpha((int)p1[i])){ aln->s[c][aln->sl[c]] = aacode[toupper(p1[i])-65]; }else{ aln->s[c][aln->sl[c]] = -1; } aln->seq[c][aln->sl[c]] = p1[i]; aln->sl[c]++; } } c++; }else{ if (c != start){ //c =0; c = start; } 
} } for (i = start; i < local_numseq+start;i++){ aln->s[i][aln->sl[i]] = 0; aln->seq[i][aln->sl[i]] = 0; } free(string); return aln; }
struct alignment* read_alignment_stockholm(struct alignment* aln,char* string) { int c = 0; int n = 0; int i = 0; int j = 0; char *p1 = 0; int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,23,13,14,15,16,17,17,18,19,20,21,22}; //int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,-1,13,14,15,16,17,-1,18,19,20,21,22}; /*aln = (struct alignment*) malloc(sizeof(struct alignment)); p1 = string; while((i = byg_end("\n",p1))!=-1){ p1+=i; if (!(byg_start("//",p1))){ break; } j = byg_end("#",p1); if(j != 1){ numseq++; } } numprofiles = (numseq << 1) - 1; aln->s = malloc(sizeof(int*) * (numseq )); aln->seq = malloc(sizeof(char*) * (numseq )); aln->ft = 0; aln->si = 0; aln->sl = malloc(sizeof(int) * (numprofiles)); aln->sip = malloc(sizeof(int*)* numprofiles); aln->nsip = malloc(sizeof(int)* numprofiles); aln->sn = malloc(sizeof(char*) * numseq); aln->lsn = malloc(sizeof(int) * numseq); for (i =0;i < numprofiles;i++){ aln->sip[i] = 0; aln->nsip[i] = 0; } for(i =0;i < numseq;i++){ aln->sip[i] = malloc(sizeof(int)*1); aln->nsip[i] = 1; aln->sip[i][0] = i; }*/ c = 0; while(aln->sl[c]){ c++; } p1 = string; while((i = byg_end("\n",p1))!=-1){ p1+=i; if (!(byg_start("//",p1))){ break; } j = byg_end("#",p1); if(j != 1){ j = byg_start(" ",p1); aln->lsn[c] = j; aln->sn[c] = malloc(sizeof(char)*(j+1)); for (i = 0;i < j;i++){ aln->sn[c][i] = p1[i]; } aln->sn[c][j] = 0; p1+=j; j = byg_start("\n",p1); aln->s[c] = malloc(sizeof(int)*(j+1)); aln->seq[c] = malloc(sizeof(char)*(j+1)); n = 0; for (i = 0;i < j;i++){ if((int)p1[i] > 32){ if(isalpha((int)p1[i])){ aln->s[c][n] = aacode[toupper(p1[i])-65]; }else{ aln->s[c][n] = -1; } aln->seq[c][n] = p1[i]; n++; } } aln->s[c][n] = 0; aln->seq[c][n] = 0; aln->sl[c] = n; c++; } } free(string); return aln; }
int read_sam_chunk(struct read_info** ri,struct parameters* param,FILE* file) { //char line[MAX_LINE]; int column = 0; int i,j,g,tmp; int c = 0; ri = clear_read_info(ri, param->num_query); char *line = NULL; size_t len = 0; ssize_t read; while ((read = getline(&line, &len, file)) != -1) { //while(fgets(line, MAX_LINE, file)){ if(line[0] != '@'){ column = 1; //<QNAME> tmp = 0; for(j = 0;j < read;j++){ tmp++; if(isspace((int)line[j])){ break; } } MMALLOC(ri[c]->name,sizeof(unsigned char)* tmp); for(j = 0;j < read;j++){ if(isspace((int)line[j])){ ri[c]->name[j] = 0; break; } ri[c]->name[j] = line[j]; } for(i = 0; i < read;i++){ if(line[i] == '\n'){ break; } if(isspace((int)line[i])){ column++; switch(column){ case 2: // <FLAG> tmp = atoi(line+i+1); ri[i]->strand = (tmp & 0x10); //WARNING - read should be reverse complemented if mapped to negative strand before tagdusting... /*tmp = atoi(line+i+1); ri[c]->strand[hit] = (tmp & 0x10); if(tmp == 4){ ri[c]->hits[hit] = 0; }else{ ri[c]->hits[hit] = 1; } hit++;*/ break; case 3: // <RNAME> break; case 4: // <POS> break; case 5: // <MAPQ> ri[c]->mapq = atof(line +i +1); break; case 6: // <CIGAR> tmp = 0; for(j = i+1;j < read;j++){ tmp++; if(isspace((int)line[j])){ break; } } ri[c]->cigar = malloc(sizeof(unsigned char)* tmp); g = 0; for(j = i+1;j < read;j++){ if(isspace((int)line[j])){ ri[c]->cigar[g] = 0; break; } ri[c]->cigar[g] = line[j]; g++; } break; case 7: // <MRNM> break; case 8: // <MPOS> break; case 9: // <ISIZE> break; case 10: // <SEQ> tmp = 0; for(j = i+1;j < read;j++){ tmp++; if(isspace((int)line[j])){ break; } } MMALLOC(ri[c]->seq,sizeof(unsigned char)* tmp); MMALLOC(ri[c]->labels,sizeof(unsigned char)* tmp); g = 0; for(j = i+1;j < read;j++){ if(isspace((int)line[j])){ ri[c]->seq[g] = 0; ri[c]->labels[g] = 0; break; } ri[c]->seq[g] = nuc_code[(int)line[j]]; ri[c]->labels[g] = 0; g++; } ri[c]->len = g; break; case 11: // <QUAL> tmp = 0; for(j = i+1;j < read;j++){ tmp++; if(isspace((int)line[j])){ break; } } g= 0; 
MMALLOC(ri[c]->qual,sizeof(unsigned char)* tmp); for(j = i+1;j < read;j++){ if(isspace((int)line[j])){ ri[c]->qual[g] = 0; break; } ri[c]->qual[g] = line[j]; g++; } break; default: i = (int) read; break; } } } tmp = byg_end("NM:i:", line ); if(tmp){ ri[c]->errors = atoi(line+tmp); //if(ri[c]->errors > 20){ ///fprintf(stderr,"%s\n,%c,%c,%c,%d\n",line, *(line +tmp), *(line +tmp+1),*(line +tmp+2), ri[c]->errors); //} }else{ ri[c]->errors = -1; } tmp = byg_end("MD:Z:", line ); if(tmp){ g = 0; for(j = tmp ;j < read;j++){ g++; if(isspace((int)line[j])){ break; } } ri[c]->md = malloc(sizeof(unsigned char)* g); g = 0; for(j = tmp ;j < read;j++){ if(isspace((int)line[j])){ ri[c]->md[g] = 0; break; } ri[c]->md[g] = line[j]; g++; } } //ri[c]->hits[hit] = 0xFFFFFFFFu; c++; if(c == param->num_query){ MFREE(line); return c; } } } MFREE(line); return c; }