/* @funcstatic seqwords_keysearch ********************************************
**
** Search swissprot with terms structure and writes a hitlist structure.
**
** The input is read line by line.  An "AC" line starts a new entry (its
** first accession is remembered and the per-entry state is reset).  "DE"
** and "KW" lines are searched for any of the terms; a match marks the whole
** entry as a hit.  "FT DOMAIN" lines are searched the same way; a match
** records the start/end coordinates parsed from that line.  When the "SQ"
** line is reached and something matched, the sequence lines up to "//" are
** concatenated and one hit per recorded domain is appended to the hitlist
** (or, if no domain matched, a single hit covering the full sequence).
**
** Every hit appended has Model set to "KEYWORD" and Acc set to the
** accession of the entry.
**
** @param [r] inf   [AjPFile]      File pointer to swissprot database
** @param [r] terms [AjPTerms]     Terms object pointer
** @param [w] hits  [EmbPHitlist*] Hitlist object pointer; must point to an
**                                 already allocated hitlist
**
** @return [AjBool] True on success, false if any argument is NULL
** @@
******************************************************************************/
static AjBool seqwords_keysearch(AjPFile inf,
                                 AjPTerms terms,
                                 EmbPHitlist *hits)
{
    AjPStr line        = NULL;     /* Line of text.                     */
    AjPStr id          = NULL;     /* Accession of the current entry.   */
    AjPStr temp        = NULL;     /* Sequence of the current entry.    */
    ajint  s           = 0;        /* Temp. start of hit value.         */
    ajint  e           = 0;        /* Temp. end of hit value.           */
    AjPInt start       = NULL;     /* Array of start of hit(s).         */
    AjPInt end         = NULL;     /* Array of end of hit(s).           */
    ajint  nhits       = 0;        /* Number of domain hits in entry.   */
    ajint  x           = 0;        /* Loop counter.                     */
    AjBool foundkw     = ajFalse;  /* Term found in a DE / KW line.     */
    AjBool foundft     = ajFalse;  /* Term found in a FT DOMAIN line.   */
    EmbPHitlist list   = NULL;     /* Alias for *hits.                  */


    /*
    ** Check for valid args.  terms and *hits are dereferenced below, so
    ** they are validated here together with inf, before anything is
    ** allocated.
    */
    if(!inf || !terms || !hits || !*hits)
        return ajFalse;

    list = *hits;


    /* Allocate strings and arrays. */
    line  = ajStrNew();
    id    = ajStrNew();
    start = ajIntNew();
    end   = ajIntNew();


    /* Start of main loop. */
    while(ajReadlineTrim(inf, &line))
    {
        /* Parse the AC line. */
        if(ajStrPrefixC(line, "AC"))
        {
            /* Copy accession number and remove the ';' from the end. */
            ajFmtScanS(line, "%*s %S", &id);
            ajStrExchangeCC(&id, ";", "\0");

            /* Reset flags & no. hits. */
            foundkw = ajFalse;
            foundft = ajFalse;
            nhits   = 0;
        }
        /* Search the description and keyword lines with search terms. */
        else if(ajStrPrefixC(line, "DE") || ajStrPrefixC(line, "KW"))
        {
            /*
            ** Search terms have a leading and trailing space to prevent
            ** them being found as substrings within other words.  To
            ** catch cases where a DE or KW line begins with a search
            ** term, we must add a leading and trailing space to line.
            ** We must first remove punctuation from the line to be parsed.
            */
            ajStrExchangeSetCC(&line, ".,;:", " ");
            ajStrAppendK(&line, ' ');
            ajStrInsertC(&line, 0, " ");

            for(x = 0; x < terms->N; x++)
            {
                if(ajStrFindCaseS(line, terms->Keywords[x]) != -1)
                {
                    /* Search term is found. */
                    foundkw = ajTrue;
                    break;
                }
            }
        }
        /* Search the feature table line with search terms. */
        else if(ajStrPrefixC(line, "FT DOMAIN"))
        {
            /*
            ** Search terms have a leading and trailing space to prevent
            ** them being found as substrings within other words.  To
            ** catch cases where a FT line ends with a search term, we
            ** must add a trailing space to line.
            ** We must first remove punctuation from the line to be parsed.
            */
            ajStrExchangeSetCC(&line, ".,;:", " ");
            ajStrAppendK(&line, ' ');

            for(x = 0; x < terms->N; x++)
            {
                if(ajStrFindCaseS(line, terms->Keywords[x]) == -1)
                    continue;

                /*
                ** Search term is found.  Only record the hit when both
                ** coordinates were actually parsed; otherwise s and e
                ** would still hold values from an earlier line and a
                ** wrong sub-sequence would be extracted later.
                */
                if(ajFmtScanS(line, "%*s %*s %d %d", &s, &e) >= 2)
                {
                    foundft = ajTrue;

                    /* Assign start and end of hit. */
                    ajIntPut(&start, nhits, s);
                    ajIntPut(&end,   nhits, e);
                    nhits++;
                }

                break;
            }
        }
        /* Parse the sequence. */
        else if(ajStrPrefixC(line, "SQ") && (foundkw || foundft))
        {
            /* Allocate memory for temp. sequence. */
            temp = ajStrNew();

            /*
            ** Read the sequence lines up to the "//" terminator.  The
            ** first three characters of each line are skipped; lines too
            ** short to have anything after them are ignored so that we
            ** never read beyond the end of the line buffer.
            */
            while(ajReadlineTrim(inf, &line) && !ajStrPrefixC(line, "//"))
            {
                if(ajStrGetLen(line) > 3)
                    ajStrAppendC(&temp, ajStrGetPtr(line) + 3);
            }

            /* Clean up temp. sequence. */
            ajStrRemoveWhite(&temp);

            /* Priority is given to domain (rather than full length)
               sequence. */
            if(foundft)
            {
                for(x = 0; x < nhits; x++)
                {
                    /* Grow the array of hits by one and create the hit. */
                    list->N++;
                    AJCRESIZE(list->hits, list->N);
                    list->hits[list->N - 1] = embHitNew();
                    ajStrAssignC(&list->hits[list->N - 1]->Model, "KEYWORD");

                    /* Assign start and end of hit. */
                    list->hits[list->N - 1]->Start = ajIntGet(start, x);
                    list->hits[list->N - 1]->End   = ajIntGet(end, x);

                    /* Extract sequence within specified range
                       (coordinates are 1-based, string positions 0-based). */
                    ajStrAssignSubS(&list->hits[list->N - 1]->Seq, temp,
                                    list->hits[list->N - 1]->Start - 1,
                                    list->hits[list->N - 1]->End - 1);

                    /* Put id into structure. */
                    ajStrAssignRef(&list->hits[list->N - 1]->Acc, id);
                }
            }
            else
            {
                /* Grow the array of hits by one and create the hit. */
                list->N++;
                AJCRESIZE(list->hits, list->N);
                list->hits[list->N - 1] = embHitNew();
                ajStrAssignC(&list->hits[list->N - 1]->Model, "KEYWORD");

                /* Extract whole sequence. */
                ajStrAssignRef(&list->hits[list->N - 1]->Seq, temp);
                list->hits[list->N - 1]->Start = 1;
                list->hits[list->N - 1]->End =
                    ajStrGetLen(list->hits[list->N - 1]->Seq);

                /* Put id into structure. */
                ajStrAssignRef(&list->hits[list->N - 1]->Acc, id);
            }

            /* Free temp. sequence. */
            ajStrDel(&temp);
        }
    }


    /* Clean up. */
    ajStrDel(&line);
    ajStrDel(&id);
    ajIntDel(&start);
    ajIntDel(&end);

    return ajTrue;
}
/* @funcstatic domainalign_ProcessStampFile **********************************
**
** Reads a STAMP output file and writes an annotated alignment file.
**
** The leading part of the input is skipped up to and including the first
** separator line (a line whose character at index 1 is NUL; in practice
** presumably a blank line).  If no separator is found a warning is issued,
** a message is written to the log and no output file is created.
**
** Otherwise the output file is opened, the SCOP or CATH classification
** records of the domain and the "SI" identifier selected by noden are
** written, and the rest of the input is processed in repeating groups of
** three blocks delimited by separator lines:
**   block 1 - numbering line and protein sequences (copied / reformatted),
**   block 2 - secondary structure (discarded),
**   block 3 - similarity records (only "Post" lines are kept).
**
** @param [r] in     [AjPStr]    Name of the input file
** @param [r] out    [AjPStr]    Name of the output file
** @param [r] domain [AjPDomain] Domain supplying the classification records
** @param [r] noden  [ajint]     Node number: 1-4 select a SCOP identifier,
**                               5-9 a CATH identifier; anything else is a
**                               fatal error
** @param [u] logf   [AjPFile]   Log file
**
** @return [void]
** @@
******************************************************************************/
static void domainalign_ProcessStampFile(AjPStr in,
                                         AjPStr out,
                                         AjPDomain domain,
                                         ajint noden,
                                         AjPFile logf)
{
    AjPFile fout   = NULL;        /* Output file.                          */
    AjPFile fin    = NULL;        /* Input file.                           */
    AjPStr  buf    = ajStrNew();  /* Current input line.                   */
    AjPStr  token  = ajStrNew();  /* First token of a sequence line.       */
    AjPStr  code   = ajStrNew();  /* Domain identifier code (7 chars).     */
    AjPStr  seq    = ajStrNew();  /* Sequence / similarity text of a line. */
    ajint   block  = 1;           /* Current block (1, 2 or 3).            */
    AjBool  gotsep = ajFalse;     /* First separator line was seen.        */


    fin = ajFileNewInNameS(in);

    if(!fin)
        ajFatal("Could not open input file in domainalign_ProcessStampFile");


    /* Skip the preamble: consume lines up to the first separator. */
    while(!gotsep && ajReadlineTrim(fin, &buf))
    {
        if(ajStrGetCharPos(buf, 1) == '\0')
            gotsep = ajTrue;
    }


    if(!gotsep)
    {
        /* Nothing usable in the input. */
        ajWarn("\n***********************************************\n"
               "* STAMP was called but output file was EMPTY! *\n"
               "* NO OUTPUT FILE GENERATED FOR THIS NODE. *\n"
               "***********************************************\n");
        ajFmtPrintF(logf, "STAMP called but output file empty. "
                    "No output file for this node!");
    }
    else
    {
        fout = ajFileNewOutNameS(out);

        if(!fout)
            ajFatal("Could not open output file in "
                    "domainalign_ProcessStampFile");


        /* Classification records followed by the node identifier. */
        if(domain->Type == ajSCOP)
        {
            ajFmtPrintF(fout, "# TY SCOP\n# XX\n");
            ajFmtPrintF(fout, "# CL %S", domain->Scop->Class);
            ajFmtPrintSplit(fout, domain->Scop->Fold,
                            "\n# XX\n# FO ", 75, " \t\n\r");
            ajFmtPrintSplit(fout, domain->Scop->Superfamily,
                            "# XX\n# SF ", 75, " \t\n\r");
            ajFmtPrintSplit(fout, domain->Scop->Family,
                            "# XX\n# FA ", 75, " \t\n\r");
            ajFmtPrintF(fout, "# XX\n");

            switch(noden)
            {
                case 1:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Scop->Sunid_Class);
                    break;

                case 2:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Scop->Sunid_Fold);
                    break;

                case 3:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Scop->Sunid_Superfamily);
                    break;

                case 4:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Scop->Sunid_Family);
                    break;

                default:
                    ajFatal("Node number error in "
                            "domainalign_ProcessStampFile");
                    break;
            }
        }
        else
        {
            ajFmtPrintF(fout, "# TY CATH\n# XX\n");
            ajFmtPrintF(fout, "# CL %S", domain->Cath->Class);
            ajFmtPrintSplit(fout, domain->Cath->Architecture,
                            "\n# XX\n# AR ", 75, " \t\n\r");
            ajFmtPrintSplit(fout, domain->Cath->Topology,
                            "# XX\n# TP ", 75, " \t\n\r");
            ajFmtPrintSplit(fout, domain->Cath->Superfamily,
                            "# XX\n# SF ", 75, " \t\n\r");
            ajFmtPrintF(fout, "# XX\n");

            switch(noden)
            {
                case 5:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Cath->Class_Id);
                    break;

                case 6:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Cath->Arch_Id);
                    break;

                case 7:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Cath->Topology_Id);
                    break;

                case 8:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Cath->Superfamily_Id);
                    break;

                case 9:
                    ajFmtPrintF(fout, "# SI %d\n# XX",
                                domain->Cath->Family_Id);
                    break;

                default:
                    ajFatal("Node number error in "
                            "domainalign_ProcessStampFile");
                    break;
            }
        }


        /* Body of the alignment. */
        while(ajReadlineTrim(fin, &buf))
        {
            /* A separator line moves on to the next block; the block
               after 3 is 1 again. */
            if(ajStrGetCharPos(buf, 1) == '\0')
            {
                block = (block == 3) ? 1 : block + 1;
                continue;
            }

            switch(block)
            {
                case 1:
                    /* Numbering line and protein sequences. */
                    if(ajStrPrefixC(buf, "Number"))
                    {
                        /* The number line is printed out as it is. */
                        ajFmtPrintF(fout, "\n# %7s %S\n", " ", buf);
                        break;
                    }

                    /* Keep only the first 7 characters of the leading
                       token as the domain identifier code. */
                    ajFmtScanS(buf, "%S", &token);
                    ajStrAssignSubS(&code, token, 0, 6);

                    /* The sequence occupies positions 13 to 69. */
                    ajStrAssignSubS(&seq, buf, 13, 69);
                    ajStrExchangeSetCC(&seq, " ", "X");
                    ajFmtPrintF(logf, "Replaced ' ' in STAMP alignment "
                                "with 'X'\n");
                    ajStrFmtUpper(&seq);

                    /* Write domain id code and sequence out. */
                    ajFmtPrintF(fout, "%-15S%7d %S%7d\n",
                                code, 0, seq, 0);
                    break;

                case 2:
                    /* Secondary structure filled with '????' (unwanted). */
                    break;

                default:
                    /* Similarity lines: keep "Post", ignore the rest
                       (Very and Less similar lines). */
                    if(ajStrPrefixC(buf, "Post"))
                    {
                        ajStrAssignSubS(&seq, buf, 13, 69);
                        ajFmtPrintF(fout, "%-15s%7s %S\n",
                                    "# Post_similar", " ", seq);
                    }
                    break;
            }
        }
    }


    /* Clean up and close input and output files. */
    ajFileClose(&fout);
    ajFileClose(&fin);
    ajStrDel(&buf);
    ajStrDel(&token);
    ajStrDel(&code);
    ajStrDel(&seq);

    return;
}