static void readGenBank(struct ReadSeqVars *V) { char *sptr; int in_definition; while (strncmp(V->sbuffer, "LOCUS", 5) != 0) getline2(V); if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } in_definition = FALSE; while (! feof(V->f)) { getline2(V); if (! feof(V->f) && strstr(V->sbuffer, "DEFINITION") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+12, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); in_definition = TRUE; } else if (! feof(V->f) && strstr(V->sbuffer, "ACCESSION") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); in_definition = FALSE; } else if (strncmp(V->sbuffer,"ORIGIN", 6) != 0) { if (in_definition) SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); } else break; } readLoop(0, endGB, V); /* reading a real GenBank database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; while (!(feof(V->f) || ((*V->sbuffer!=0) && (strstr(V->sbuffer,"LOCUS") == V->sbuffer)))) getline2(V); /* SRE: V->s now holds "//", so sequential reads are wedged: fixed Tue Jul 13 1993 */ while (!feof(V->f) && strstr(V->sbuffer, "LOCUS ") != V->sbuffer) getline2(V); }
static void readPearson(struct ReadSeqVars *V) { char *sptr; if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); /* workaround for long NCBI NR lines */ while (V->longline && ! feof(V->f)) getline2(V); readLoop(0, endPearson, V); while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) getline2(V); }
static void readZuker(struct ReadSeqVars *V) { char *sptr; getline2(V); /*s == "seqLen seqid string..."*/ if ((sptr = strtok(V->sbuffer+6, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); readLoop(0, endZuker, V); while (!(feof(V->f) | ((*V->sbuffer != '\0') & (*V->sbuffer == '(')))) getline2(V); }
static void readEMBL(struct ReadSeqVars *V) { char *sptr; /* make sure we have first line */ while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0) getline2(V); if ((sptr = strtok(V->sbuffer+5, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } do { getline2(V); if (!feof(V->f) && strstr(V->sbuffer, "AC ") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+5, "; \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); } else if (!feof(V->f) && strstr(V->sbuffer, "DE ") == V->sbuffer) { if ((sptr = strtok(V->sbuffer+5, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } } while (! feof(V->f) && strncmp(V->sbuffer,"SQ",2) != 0); readLoop(0, endEMBL, V); /* reading a real EMBL database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; /* load next record's ID line */ while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0) getline2(V); }
static void readGCGdata(struct ReadSeqVars *V) { int binary = FALSE; /* whether data are binary or not */ int blen; /* length of binary sequence */ /* first line contains ">>>>" followed by name */ if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0) { binary = TRUE; SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); blen = atoi(sqd_parse[2]); } else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0) SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); else Die("bogus GCGdata format? %s", V->sbuffer); /* second line contains free text description */ getline2(V); SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); if (binary) { /* allocate for blen characters +3... (allow for 3 bytes of slop) */ if (blen >= V->maxseq) { V->maxseq = blen; if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) Die("malloc failed"); } /* read (blen+3)/4 bytes from file */ if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4)) Die("fread failed"); V->seqlen = blen; /* convert binary code to seq */ GCGBinaryToSequence(V->seq, blen); } else readLoop(0, endGCGdata, V); while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) getline2(V); }
static void readPIR(struct ReadSeqVars *V) { char *sptr; /* load first line of entry */ while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0) getline2(V); if (feof(V->f)) return; if ((sptr = strtok(V->sbuffer + 15, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } do { getline2(V); if (!feof(V->f) && strncmp(V->sbuffer, "TITLE", 5) == 0) SetSeqinfoString(V->sqinfo, V->sbuffer+15, SQINFO_DESC); else if (!feof(V->f) && strncmp(V->sbuffer, "ACCESSION", 9) == 0) { if ((sptr = strtok(V->sbuffer+15, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); } } while (! feof(V->f) && (strncmp(V->sbuffer,"SEQUENCE", 8) != 0)); getline2(V); /* skip next line, coords */ readLoop(0, endPIR, V); /* reading a real PIR-CODATA database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; /* get next line */ while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0) getline2(V); }
static int parse_MSF(FILE *fp, AINFO *ainfo) { char buffer[LINEBUFLEN]; char *sptr; int nseq; /* Get first dividing line. MSF format specifies ints after MSF: and Check: * but we don't make sure of this */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; } } while (strstr(buffer, " MSF: ") == NULL || strstr(buffer, " Check: ") == NULL || strstr(buffer, " ..") == NULL); /* Get names, weights from header */ nseq = 0; /*CONSTCOND*/ while (1) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; } if (is_blankline(buffer)) continue; if (strncmp(buffer, "//", 2) == 0) break; sptr = strtok(buffer, WHITESPACE); if (sptr == NULL || strcmp(sptr, "Name:") != 0 || strstr(sptr+5, "Weight:") != 0) { squid_errno = SQERR_FORMAT; return 0; } if ((sptr = strtok(NULL, WHITESPACE)) == NULL) {squid_errno=SQERR_FORMAT; return 0; } SetSeqinfoString(&(ainfo->sqinfo[nseq]), sptr, SQINFO_NAME); while (sptr != NULL && strcmp(sptr, "Weight:") != 0) sptr = strtok(NULL, WHITESPACE); if ((sptr = strtok(NULL, WHITESPACE)) == NULL) {squid_errno=SQERR_FORMAT; return 0; } if (! IsReal(sptr)) { squid_errno = SQERR_FORMAT; return 0; } ainfo->wgt[nseq] = atof(sptr); nseq++; } if (nseq != ainfo->nseq) { squid_errno = SQERR_FORMAT; return 0; } return 1; }
static void readIG(struct ReadSeqVars *V) { char *nm; /* position past ';' comments */ do { getline2(V); } while (! (feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer != ';')) )); if (!feof(V->f)) { if ((nm = strtok(V->sbuffer, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); readLoop(0, endIG, V); } while (!(feof(V->f) || ((*V->sbuffer != '\0') && (*V->sbuffer == ';')))) getline2(V); }
static void readUWGCG(struct ReadSeqVars *V) { char *si; char *sptr; int done; V->seqlen = 0; /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ /*drop above or ".." from id*/ if ((si = strstr(V->sbuffer," Length: ")) != NULL) *si = 0; else if ((si = strstr(V->sbuffer,"..")) != NULL) *si = 0; if ((sptr = strtok(V->sbuffer, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); do { done = feof(V->f); getline2(V); if (! done) addseq(V->sbuffer, V); } while (!done); }
static void readStrider(struct ReadSeqVars *V) { char *nm; while ((!feof(V->f)) && (*V->sbuffer == ';')) { if (strncmp(V->sbuffer,"; DNA sequence", 14) == 0) { if ((nm = strtok(V->sbuffer+16, ",\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); } getline2(V); } if (! feof(V->f)) readLoop(1, endStrider, V); /* load next line */ while ((!feof(V->f)) && (*V->sbuffer != ';')) getline2(V); }
/* Function: ReadSELEX() * Date: SRE, Sun Jun 6 18:24:09 1999 [St. Louis] * * Purpose: Parse an alignment read from an open SELEX format * alignment file. (SELEX is a single alignment format). * Return the alignment, or NULL if we've already read the * alignment or there's no alignment data in the file. * * Limitations: SELEX is the only remaining multipass parser for * alignment files. It cannot read from gzip or from stdin. * It Die()'s here if you try. The reason for this * that SELEX allows space characters as gaps, so we don't * know the borders of an alignment block until we've seen * the whole block. I could rewrite to allow single-pass * parsing (by storing the whole block in memory) but * since SELEX is now legacy, why bother. * * Note that the interface is totally kludged: fastest * possible adaptation of old ReadSELEX() to the new * MSA interface. * * Args: afp - open alignment file * * Returns: MSA * - an alignment object * caller responsible for an MSAFree() * NULL if no alignment data. */ MSA * ReadSELEX(MSAFILE *afp) { MSA *msa; /* RETURN: mult seq alignment */ FILE *fp; /* ptr to opened seqfile */ char **aseqs; /* aligned seqs */ int num = 0; /* number of seqs read */ char buffer[LINEBUFLEN]; /* input buffer for lines */ char bufcpy[LINEBUFLEN]; /* strtok'able copy of buffer */ struct block_struc { /** alignment data for a block: */ int lcol; /* furthest left aligned sym */ int rcol; /* furthest right aligned sym */ } *blocks = NULL; int blocknum; /* number of blocks in file */ char *nptr; /* ptr to start of name on line */ char *sptr; /* ptr into sequence on line */ int currnum; /* num. seqs in given block */ int currblock; /* index for blocks */ int i; /* loop counter */ int seqidx; /* counter for seqs */ int alen; /* length of alignment */ int warn_names; /* becomes TRUE if names don't match between blocks */ int headnum; /* seqidx in per-sequence header info */ int currlen; int count; int have_cs = 0; int have_rf = 0; AINFO base_ainfo, *ainfo; /* hack: used to be passed ptr to AINFO */ /* Convert from MSA interface to what old ReadSELEX() did: * - copy our open fp, rather than opening file * - verify that we're not reading a gzip or stdin */ if (feof(afp->f)) return NULL; if (afp->do_gzip || afp->do_stdin) Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); fp = afp->f; ainfo = &base_ainfo; /*************************************************** * First pass across file. * Count seqs, get names, determine column info * Determine what sorts of info are active in this file. ***************************************************/ InitAinfo(ainfo); /* get first line of the block * (non-comment, non-blank) */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_NODATA; return 0; } strcpy(bufcpy, buffer); if (*buffer == '#') { if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1; else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1; } } while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || (strchr(commentsyms, *nptr) != NULL)); blocknum = 0; warn_names = FALSE; while (!feof(fp)) { /* allocate for info about this block. */ if (blocknum == 0) blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc)); else blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc)); blocks[blocknum].lcol = LINEBUFLEN+1; blocks[blocknum].rcol = -1; currnum = 0; while (nptr != NULL) /* becomes NULL when this block ends. */ { /* First block only: save names */ if (blocknum == 0) { if (currnum == 0) ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO)); else ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO)); ainfo->sqinfo[currnum].flags = 0; SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME); } else /* in each additional block: check names */ { if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0) warn_names = TRUE; } currnum++; /* check rcol, lcol */ if ((sptr = strtok(NULL, WHITESPACE)) != NULL) { /* is this the furthest left we've seen word 2 in this block? */ if (sptr - bufcpy < blocks[blocknum].lcol) blocks[blocknum].lcol = sptr - bufcpy; /* look for right side in buffer */ for (sptr = buffer + strlen(buffer) - 1; strchr(WHITESPACE, *sptr) != NULL; sptr --) /* do nothing */ ; if (sptr - buffer > blocks[blocknum].rcol) blocks[blocknum].rcol = sptr - buffer; } /* get the next line; blank line means end of block */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; } strcpy(bufcpy, buffer); if (strncmp(buffer, "#=SS", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS; else if (strncmp(buffer, "#=SA", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA; else if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1; else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1; if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; } while (strchr(commentsyms, *nptr) != NULL); } /* check that number of sequences matches expected */ if (blocknum == 0) num = currnum; else if (currnum != num) Die("Parse error in ReadSELEX()"); blocknum++; /* get first line of next block * (non-comment, non-blank) */ do { if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; } strcpy(bufcpy, buffer); } while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || (strchr(commentsyms, *nptr) != NULL)); } /*************************************************** * Get ready for second pass: * figure out the length of the alignment * malloc space * rewind the file ***************************************************/ alen = 0; for (currblock = 0; currblock < blocknum; currblock++) alen += blocks[currblock].rcol - blocks[currblock].lcol + 1; rewind(fp); /* allocations. we can't use AllocateAlignment because of * the way we already used ainfo->sqinfo. */ aseqs = (char **) MallocOrDie (num * sizeof(char *)); if (have_cs) ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char)); if (have_rf) ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char)); for (i = 0; i < num; i++) { aseqs[i] = (char *) MallocOrDie ((alen+1) * sizeof(char)); if (ainfo->sqinfo[i].flags & SQINFO_SS) ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char)); if (ainfo->sqinfo[i].flags & SQINFO_SA) ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char)); } ainfo->alen = alen; ainfo->nseq = num; ainfo->wgt = (float *) MallocOrDie (sizeof(float) * num); FSet(ainfo->wgt, num, 1.0); /*************************************************** * Second pass across file. Parse header; assemble sequences ***************************************************/ /* We've now made a complete first pass over the file. We know how * many blocks it contains, we know the number of seqs in the first * block, and we know every block has the same number of blocks; * so we can be a bit more cavalier about error-checking as we * make the second pass. */ /* Look for header */ headnum = 0; for (;;) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) Die("Parse error in ReadSELEX()"); strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */ if (strcmp(nptr, "#=AU") == 0 && (sptr = strtok(NULL, "\n")) != NULL) ainfo->au = Strdup(sptr); else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL) ainfo->name = Strdup(sptr); else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL) ainfo->acc = Strdup(sptr); else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL) ainfo->desc = Strdup(sptr); else if (strcmp(nptr, "#=GA") == 0) { if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=GA line in ReadSELEX()"); ainfo->ga1 = atof(sptr); if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=GA line in ReadSELEX()"); ainfo->ga2 = atof(sptr); ainfo->flags |= AINFO_GA; } else if (strcmp(nptr, "#=TC") == 0) { if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=TC line in ReadSELEX()"); ainfo->tc1 = atof(sptr); if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=TC line in ReadSELEX()"); ainfo->tc2 = atof(sptr); ainfo->flags |= AINFO_TC; } else if (strcmp(nptr, "#=NC") == 0) { if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=NC line in ReadSELEX()"); ainfo->nc1 = atof(sptr); if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=NC line in ReadSELEX()"); ainfo->nc2 = atof(sptr); ainfo->flags |= AINFO_NC; } else if (strcmp(nptr, "#=SQ") == 0) /* per-sequence header info */ { /* first field is the name */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=SQ line in ReadSELEX()"); if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE; /* second field is the weight */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=SQ line in ReadSELEX()"); if (!IsReal(sptr)) Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number"); ainfo->wgt[headnum] = atof(sptr); /* third field is database source id */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID); /* fourth field is database accession number */ if ((sptr = strtok(NULL, WHITESPACE)) == NULL) Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC); /* fifth field is start..stop::olen */ if ((sptr = strtok(NULL, ".:")) == NULL) Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START); if ((sptr = strtok(NULL, ".:")) == NULL) Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP); if ((sptr = strtok(NULL, ":\t ")) == NULL) Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN); /* rest of line is optional description */ if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC); headnum++; } else if (strcmp(nptr, "#=CS") == 0) break; else if (strcmp(nptr, "#=RF") == 0) break; else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */ } currlen = 0; for (currblock = 0 ; currblock < blocknum; currblock++) { /* parse the block */ seqidx = 0; while (nptr != NULL) { /* Consensus structure */ if (strcmp(nptr, "#=CS") == 0) { if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in #=CS line in ReadSELEX()"); } /* Reference coordinates */ else if (strcmp(nptr, "#=RF") == 0) { if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in #=RF line in ReadSELEX()"); } /* Individual secondary structure */ else if (strcmp(nptr, "#=SS") == 0) { if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in #=SS line in ReadSELEX()"); } /* Side chain % surface accessibility code */ else if (strcmp(nptr, "#=SA") == 0) { if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in #=SA line in ReadSELEX()"); } /* Aligned sequence; avoid unparsed machine comments */ else if (strncmp(nptr, "#=", 2) != 0) { if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in alignment line in ReadSELEX()"); seqidx++; } /* get next line */ for (;;) { nptr = NULL; if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */ strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */ if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ if (strchr(commentsyms, *nptr) == NULL) break; /* data */ } } /* end of a block */ currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1; /* get line 1 of next block */ for (;;) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */ strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */ if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */ } } /* end of the file */ /* Lengths in sqinfo are for raw sequence (ungapped), * and SS, SA are 0..rlen-1 not 0..alen-1. * Only the seqs with structures come out of here with lengths set. */ for (seqidx = 0; seqidx < num; seqidx++) { int apos, rpos; /* secondary structures */ if (ainfo->sqinfo[seqidx].flags & SQINFO_SS) { for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseqs[seqidx][apos])) { ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos]; rpos++; } ainfo->sqinfo[seqidx].ss[rpos] = '\0'; } /* Surface accessibility */ if (ainfo->sqinfo[seqidx].flags & SQINFO_SA) { for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseqs[seqidx][apos])) { ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos]; rpos++; } ainfo->sqinfo[seqidx].sa[rpos] = '\0'; } } /* NULL-terminate all the strings */ if (ainfo->rf != NULL) ainfo->rf[alen] = '\0'; if (ainfo->cs != NULL) ainfo->cs[alen] = '\0'; for (seqidx = 0; seqidx < num; seqidx++) aseqs[seqidx][alen] = '\0'; /* find raw sequence lengths for sqinfo */ for (seqidx = 0; seqidx < num; seqidx++) { count = 0; for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++) if (!isgap(*sptr)) count++; ainfo->sqinfo[seqidx].len = count; ainfo->sqinfo[seqidx].flags |= SQINFO_LEN; } /*************************************************** * Garbage collection and return ***************************************************/ free(blocks); if (warn_names) Warn("sequences may be in different orders in blocks of %s?", afp->fname); /* Convert back to MSA structure. (Wasteful kludge.) */ msa = MSAFromAINFO(aseqs, ainfo); MSAVerifyParse(msa); FreeAlignment(aseqs, ainfo); return msa; }
static void readSquid(struct ReadSeqVars *V) { char *sptr; int dostruc = FALSE; while (strncmp(V->sbuffer, "NAM ", 4) != 0) getline2(V); if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); /*CONSTCOND*/ while (1) { getline2(V); if (feof(V->f)) {squid_errno = SQERR_FORMAT; return; } if (strncmp(V->sbuffer, "SRC ", 4) == 0) { if ((sptr = strtok(V->sbuffer+4, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); if ((sptr = strtok(NULL, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); if ((sptr = strtok(NULL, ".")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_START); if ((sptr = strtok(NULL, ".:")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP); if ((sptr = strtok(NULL, ": \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN); } else if (strncmp(V->sbuffer, "DES ", 4) == 0) { if ((sptr = strtok(V->sbuffer+4, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } else if (strncmp(V->sbuffer,"SEQ", 3) == 0) break; } if (strstr(V->sbuffer, "+SS") != NULL) dostruc = TRUE; V->seqlen = 0; /*CONSTCOND*/ while (1) { /* sequence line */ getline2(V); if (feof(V->f) || strncmp(V->sbuffer, "++", 2) == 0) break; addseq(V->sbuffer, V); /* structure line */ if (dostruc) { getline2(V); if (feof(V->f)) { squid_errno = SQERR_FORMAT; return; } addstruc(V->sbuffer, V); } } while (!feof(V->f) && strncmp(V->sbuffer, "NAM ", 4) != 0) getline2(V); }