/*
 * read_dssp: mark solvent-accessible residues using a DSSP output file.
 *
 * Opens options->dssp_file_name, skips everything up to and including the
 * column-header line (the one carrying "RESIDUE" at column 5), and then, for
 * each residue record whose accessibility field is greater than
 * options->acc_cutoff, sets solvent_accessible = 1 on every entry of
 * protein->sequence whose pdb_id equals the record's residue identifier.
 *
 * Returns 0 if at least one residue was marked, 1 otherwise (this includes
 * the case in which the file could not be opened).
 */
int read_dssp (Options * options, Protein *protein) {

    /* fixed-column positions (0-based) of the fields read from each line */
    enum {
        DSSP_HEADER_COL = 5,   /* where "RESIDUE" sits on the header line  */
        DSSP_ID_COL     = 6,   /* residue number + insertion code          */
        DSSP_ID_LEN     = 5,
        DSSP_ACC_COL    = 34,  /* accessibility                            */
        DSSP_ACC_LEN    = 4
    };
    char line[LONGSTRING] = {'\0'};
    char pdb_id[PDB_ATOM_RES_NO_LEN+2] = {'\0'};
    char tmp[DSSP_ACC_LEN+1];
    int resctr, acc, total = 0;
    FILE *fptr;

    fptr = efopen (options->dssp_file_name, "r");
    if ( !fptr ) {
        /* NOTE(review): efopen() is defined elsewhere; if it can hand back
           NULL, passing that to fgets() would crash, so bail out here */
        return 1;
    }

    /* skip the preamble, up to and including the column-header line */
    while ( fgets (line, LONGSTRING, fptr) != NULL ) {
        /* the length test keeps the comparison inside the text just read */
        if ( strlen (line) > (size_t) DSSP_HEADER_COL
             && !strncmp (line+DSSP_HEADER_COL, "RESIDUE", 7) ) {
            break;
        }
    }

    /* one residue record per line from here on */
    while ( fgets (line, LONGSTRING, fptr) != NULL ) {

        /* lines containing '!' carry no residue data */
        if ( strchr (line, '!') ) continue;

        /* a line that ends before the accessibility column would otherwise be
           parsed from whatever a previous, longer line left in the buffer */
        if ( strlen (line) <= (size_t) DSSP_ACC_COL ) continue;

        /* pdb_id was zero-initialized and only its first DSSP_ID_LEN bytes
           are ever written, so it stays NUL-terminated */
        strncpy (pdb_id, line+DSSP_ID_COL, DSSP_ID_LEN);
        string_clean (pdb_id, DSSP_ID_LEN);
        if ( !pdb_id[0] ) continue;

        strncpy (tmp, line+DSSP_ACC_COL, DSSP_ACC_LEN);
        tmp[DSSP_ACC_LEN] = '\0';
        acc = atoi (tmp);
        if ( acc <= options->acc_cutoff ) continue;

        /* no early exit on a match: every residue carrying this id is marked */
        for ( resctr = 0; resctr < protein->length; resctr++ ) {
            if ( !strcmp (pdb_id, protein->sequence[resctr].pdb_id) ) {
                protein->sequence[resctr].solvent_accessible = 1;
                total++;
            }
        }
    }

    fclose (fptr);

    if ( !total ) {
        fprintf (stderr, "No residue solvent accessible (?!)\n");
        return 1;
    }

    return 0;
}
/*
 * read_pdb: read residues and heavy-atom coordinates from a PDB file.
 *
 * pdbname  path of the file to read
 * protein  on success receives the residue array (protein->sequence) and
 *          the number of residues (protein->length)
 * chain    chain identifier to keep; if 0, a warning is printed and no
 *          chain filtering is done
 *
 * The file is scanned twice: the first pass counts residues (a residue
 * starts whenever the residue-number + insertion-code field changes on an
 * ATOM or HETATM line), the second pass fills in residue ids, residue types
 * and atom names/coordinates.  Both passes stop at the first TER or END line
 * that passes the chain filter.  Atoms whose name has 'H' in either of its
 * first two columns are skipped.
 *
 * Returns 0 on success and 1 on any failure (file cannot be opened,
 * allocation failure, residue type not recognized by single_letter(), or a
 * residue with MAX_NO_ATOMS or more atoms).  On failure the file is closed,
 * the residue array is released and *protein is left untouched.
 */
int read_pdb ( char * pdbname, Protein * protein, char chain) {

    Residue * sequence = NULL;
    FILE * fptr = NULL;
    char line[BUFFLEN];
    char oldresno[PDB_ATOM_RES_NO_LEN+2];  /* res name: 4 digits + insertion code + \0 */
    char oldrestype [PDB_ATOM_RES_NAME_LEN+2];
    char tmp[PDB_ATOM_X_LEN+1], *auxptr;
    int atomctr, resctr, no_res, ctr, nonblank;
    int status = 1;                        /* pessimistic until the very end */
    char single_letter ( char code[]);

    /* open file */
    fptr = fopen ( pdbname, "r");
    if ( !fptr ) {
        fprintf (stderr, "Cno %s.\n", pdbname);
        return 1;
    }

    /* warn if no chain given */
    if ( !chain ) fprintf ( stderr,"No chain specified. Using the first one.\n");

    /* first pass: count residues */
    memset (line, 0, BUFFLEN);
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    resctr = 0;
    while ( fgets (line, BUFFLEN, fptr) != NULL ) {
        if ( chain && line[PDB_ATOM_CHAINID] != chain ) continue;
        if ( !strncmp (line, "TER", 3) || !strncmp (line, "END", 3) ) break;
        if ( strncmp (line, "ATOM", 4) && strncmp (line, "HETATM", 6) ) continue;

        if ( strncmp (line+PDB_ATOM_RES_NO, oldresno, PDB_ATOM_RES_NO_LEN+1) ) {
            strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
            oldresno[PDB_ATOM_RES_NO_LEN+1] = '\0';
            resctr ++;
        }
    }
    no_res = resctr;

    /* allocate space; a NULL result is an error only if something was asked for */
    sequence = emalloc ( no_res*sizeof (Residue));
    if ( !sequence && no_res > 0 ) goto cleanup;

    /* second pass: read in the atoms */
    rewind ( fptr);
    memset (line, 0, BUFFLEN);
    /* the residue id includes the insertion code */
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    memset (oldrestype, 0, PDB_ATOM_RES_NAME_LEN+2);
    resctr  = -1;
    atomctr = 0;
    while ( fgets (line, BUFFLEN, fptr) != NULL ) {
        if ( chain && line[PDB_ATOM_CHAINID] != chain ) continue;
        if ( !strncmp (line, "TER", 3) || !strncmp (line, "END", 3) ) break;
        if ( strncmp (line, "ATOM", 4) && strncmp (line, "HETATM", 6) ) continue;

        /* if it's a hydrogen - skip */
        if ( line[PDB_ATOM_ATOM_NAME] == 'H' || line[PDB_ATOM_ATOM_NAME+1] == 'H' ) continue;

        /* adjust the counters */
        if ( strncmp (line+PDB_ATOM_RES_NO, oldresno, PDB_ATOM_RES_NO_LEN+1) ) {
            /* first atom of a new residue */
            strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
            strncpy (oldrestype, line+PDB_ATOM_RES_NAME, PDB_ATOM_RES_NAME_LEN);
            oldresno[PDB_ATOM_RES_NO_LEN+1]   = '\0';
            oldrestype[PDB_ATOM_RES_NAME_LEN] = '\0';
            resctr ++;
            atomctr = 0;

            sequence[resctr].no_atoms = 1;
            strncpy ( sequence[resctr].pdb_id, oldresno, PDB_ATOM_RES_NO_LEN+2);
            sequence[resctr].pdb_id[PDB_ATOM_RES_NO_LEN+1] = '\0';
            strncpy ( sequence[resctr].res_type, oldrestype, PDB_ATOM_RES_NAME_LEN+1);
            sequence[resctr].res_type[PDB_ATOM_RES_NAME_LEN] = '\0';
            sequence[resctr].res_type_short = single_letter ( sequence[resctr].res_type );
            if ( !sequence[resctr].res_type_short ) goto cleanup;

        } else {
            /* another atom of the current residue */
            atomctr ++;
            if ( atomctr >= MAX_NO_ATOMS ) {
                fprintf ( stderr, "Error: I thought every aa has < %d atoms.\n",
                          MAX_NO_ATOMS );
                goto cleanup;
            }
            sequence[resctr].no_atoms = atomctr + 1;
        }

        /* read in atom info: skip to the first letter of the atom-name field,
           then copy the alphanumeric run that follows.  The scan stays inside
           the PDB_ATOM_ATOM_NAME_LEN columns of the field, so the character
           in the next column is never mistaken for part of the name, and
           digits are kept so that e.g. "CD1" and "CD2" remain distinct. */
        auxptr = line + PDB_ATOM_ATOM_NAME;
        ctr = 0;
        while ( ctr < PDB_ATOM_ATOM_NAME_LEN
                && !isalpha ((unsigned char) auxptr[ctr]) ) ctr++;
        nonblank = 0;
        while ( ctr < PDB_ATOM_ATOM_NAME_LEN
                && isalnum ((unsigned char) auxptr[ctr]) ) {
            tmp[nonblank] = auxptr[ctr];
            nonblank ++;
            ctr ++;
        }
        tmp[nonblank] = '\0';

        strncpy ( sequence[resctr].atom[atomctr].type, tmp, PDB_ATOM_ATOM_NAME_LEN );

        /* is this a backbone atom? (N, C, O or CA) */
        sequence[resctr].atom[atomctr].backbone = 0;
        if ( nonblank == 1 ) {
            sequence[resctr].atom[atomctr].backbone =
                !( strcmp ( tmp, "N") && strcmp ( tmp, "C") && strcmp ( tmp, "O") );
        } else if ( nonblank == 2 ) {
            sequence[resctr].atom[atomctr].backbone = !strcmp ( tmp, "CA" );
        }

        /* coordinates: fixed-width fields, copied out and converted */
        strncpy ( tmp, line+PDB_ATOM_X, PDB_ATOM_X_LEN);
        tmp[PDB_ATOM_X_LEN] = '\0';
        sequence[resctr].atom[atomctr].x = atof (tmp);
        strncpy ( tmp, line+PDB_ATOM_Y, PDB_ATOM_Y_LEN);
        tmp[PDB_ATOM_Y_LEN] = '\0';
        sequence[resctr].atom[atomctr].y = atof (tmp);
        strncpy ( tmp, line+PDB_ATOM_Z, PDB_ATOM_Z_LEN);
        tmp[PDB_ATOM_Z_LEN] = '\0';
        sequence[resctr].atom[atomctr].z = atof (tmp);
    }

    /* clean PDB id tags from spaces */
    for ( resctr = 0; resctr < no_res; resctr ++ ) {
        string_clean (sequence[resctr].pdb_id, PDB_ATOM_RES_NO_LEN+2);
    }

    /* return values: ownership of the array passes to *protein */
    protein->sequence = sequence;
    protein->length   = no_res;
    sequence = NULL;
    status   = 0;

 cleanup:
    /* NOTE(review): assumes emalloc() hands out malloc-family memory - confirm */
    free (sequence);
    fclose (fptr);
    return status;
}
/* MAIN() ------ */ int main(int argc, char *argv[]) { static char *seperators = " "; char *file, *token, *where_to, *filename; // *start; char **term_list, **first, **last, **current; ANT_link_extract_term *link_index, *index_term; long terms_in_index, current_docid, param, file_number; long lowercase_only, first_param; long is_utf8_token, cmp, is_substring = FALSE; // token_len char *command; ANT_directory_iterator_object file_object; char buffer[1024 * 1024]; if (argc < 3) exit(printf("Usage:%s [-chinese] [-lowercase] <index> <file_to_link> ...\n", argv[0])); first_param = 1; lowercase_only = FALSE; chinese = FALSE; for (param = 1; param < argc; param++) { if (*argv[param] == '-') { command = argv[param] + 1; if (strcmp(command, "lowercase") == 0) { lowercase_only = TRUE; ++first_param; } else if (strcmp(command, "chinese") == 0) { chinese = TRUE; ++first_param; } else exit(printf("Unknown parameter:%s\n", argv[param])); } } link_index = read_index(argv[first_param], &terms_in_index); file_number = 1; for (param = first_param + 1; param < argc; param++) { ANT_directory_iterator_recursive disk(argv[param]); // make the recursive pattern matching as for default files reading if (disk.first(&file_object) == NULL) file = filename = NULL; else { filename = file_object.filename; file = ANT_disk::read_entire_file(filename); } while (file != NULL) { current_docid = get_doc_id(file); if (current_docid > 0) { // printf("ID:%d\n", current_docid); string_clean(file, lowercase_only, TRUE); current = term_list = new char *[strlen(file)]; // this is the worst case by far if (chinese) create_utf8_token_list(file, term_list); else { for (token = strtok(file, seperators); token != NULL; token = strtok(NULL, seperators)) *current++ = token; *current = NULL; } for (first = term_list; *first != NULL; first++) { // fprintf(stderr, "%s\n", *first); where_to = buffer; for (last = first; *last != NULL; last++) { if (where_to == buffer) { strcpy(buffer, *first); where_to = buffer + 
strlen(buffer); if (chinese) { if ((*first[0] & 0x80) && isutf8(*first)) is_utf8_token = TRUE; else is_utf8_token = FALSE; } } else { if (!chinese) *where_to++ = ' '; strcpy(where_to, *last); where_to += strlen(*last); } *where_to = '\0'; index_term = find_term_in_list(buffer, link_index, terms_in_index); if (index_term == NULL) break; // we're after the last term in the list so can stop because we can't be a substring if (chinese) { is_substring = FALSE; cmp = utf8_token_compare(buffer, index_term->term, &is_substring); } else cmp = string_compare(buffer, index_term->term); if (cmp == 0) // we're a term in the list { index_term->total_occurences++; if (index_term->last_docid != current_docid) { index_term->last_docid = current_docid; index_term->docs_containing_term++; } } else { if (chinese) cmp = is_substring == TRUE ? 0 : 1; else cmp = memcmp(buffer, index_term->term, strlen(buffer)); if (cmp != 0) break; // we're a not a substring so we can't find a longer term } } } if (chinese) free_utf8_token_list(term_list); delete [] term_list; delete [] file; if (file_number % 1000 == 0) fprintf(stderr, "Files processed:%d\n", file_number); file_number++; } else fprintf(stderr, "Error reading file %s\n", filename); //filename = disk.get_next_filename(); if (disk.next(&file_object) == NULL) file = filename = NULL; else { filename = file_object.filename; file = ANT_disk::read_entire_file(filename); } } } print_answer(link_index, terms_in_index); fprintf(stderr, "%s Completed\n", argv[0]); return 0; }
/*
 * fill_protein_info: read one chain's residues and heavy atoms from an
 * already opened PDB stream into *protein.
 *
 * fptr     open PDB stream; it is rewound between the two passes and is not
 *          closed here
 * chain    chain identifier to keep; if 0, no chain filtering is done
 * protein  zeroed on entry; on success receives the residue array
 *          (sequence) and the residue count (length)
 *
 * Only ATOM records are used.  The first pass counts residues (a residue
 * starts whenever the residue-number + insertion-code field changes), the
 * second pass fills in ids, residue types, atom names and coordinates.
 * Atoms whose name has 'H' in either of its first two columns are skipped,
 * and within a residue only the first atom of any given name is kept.
 *
 * Returns
 *   0                          success
 *   ERR_NO_FILE_OR_CHAIN       a chain was requested but never seen
 *   -1                         no residues found (taken as end of the read)
 *   1                          allocation failure
 *   ERR_NONSENSE               residue count mismatch between the passes, or
 *                              string_clean() rejected a residue id
 *   ERR_MAX_ATOMS              a residue holds MAX_NO_ATOMS or more atoms, or
 *                              its atom names no longer fit the bookkeeping
 *                              buffer
 *   ERR_SSE_NONE|ERR_CA_TRACE  every atom read was a CA
 * On every non-zero return the residue array is released and *protein stays
 * zeroed.
 */
int fill_protein_info ( FILE * fptr, char chain, Protein * protein) {

    /* TODO for the moment we rely on PDB annotation
       to extract structural elements - that should be changed */
    Residue * sequence = NULL;
    char line[BUFFLEN];
    char oldresno[PDB_ATOM_RES_NO_LEN+2];  /* res name: 4 digits + insertion code + \0 */
    char oldrestype [PDB_ATOM_RES_NAME_LEN+2];
    char tmp[BUFFLEN], *auxptr;
    char atomtypes_read_in[BUFFLEN];       /* "_N_CA_C_..." for the current residue */
    char atom_key[PDB_ATOM_ATOM_NAME_LEN+3]; /* '_' + name + '_' + '\0' */
    char old_chain;
    int atomctr, resctr, no_res, ctr, nonblank;
    int seen_len = 0;                      /* strlen of atomtypes_read_in */
    int retval;
    int status = 0;
    int chain_found;
    int ca_trace;
    char single_letter ( char code[]);
    int has_backbone (Residue * sequence, int from, int to);

    /********************************************/
    /* cleanup                                  */
    memset (protein, 0, sizeof(Protein) );

    /********************************************/
    /* count residues                           */
    memset (line, 0, BUFFLEN);
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    resctr      = 0;
    chain_found = 0;
    old_chain   = '\0';
    while ( fgets (line, BUFFLEN, fptr) != NULL ) {

        if ( resctr ) {
            if ( !strncmp (line, "END", 3)
                 || (chain && line[PDB_ATOM_CHAINID] != old_chain) ) break;
        }
        if ( chain && line[PDB_ATOM_CHAINID] != chain ) continue;
        chain_found = 1;

        if ( strncmp (line, "ATOM", 4) ) continue;

        if ( strncmp (line+PDB_ATOM_RES_NO, oldresno, PDB_ATOM_RES_NO_LEN+1) ) {
            strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
            oldresno[PDB_ATOM_RES_NO_LEN+1] = '\0';
            /* handling the case when the chain is not given,
               meaning: "take the first chain" */
            old_chain = line[PDB_ATOM_CHAINID];
            resctr ++;
        }
    }

    /* sanity: */
    if ( chain && !chain_found ) {
        fprintf (stderr, "Chain %c not found.\n", chain);
        return ERR_NO_FILE_OR_CHAIN;
    }

    no_res = resctr;
    if ( !no_res ) return -1;  /* take it as the end of the read */

    /* allocate space */
    sequence = emalloc ( no_res*sizeof (Residue));
    if ( !sequence ) return 1;

    /*********************************************/
    /* read in residue numbers and atom coords   */
    rewind ( fptr);
    memset (line, 0, BUFFLEN);
    old_chain = '\0';
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    memset (oldrestype, 0, PDB_ATOM_RES_NAME_LEN+2);
    memset (atomtypes_read_in, 0, BUFFLEN);
    resctr   = -1;
    ca_trace = 1;
    while ( fgets (line, BUFFLEN, fptr) != NULL ) {

        if ( resctr > -1 ) {
            if ( !strncmp (line, "END", 3)
                 || (chain && line[PDB_ATOM_CHAINID] != old_chain) ) break;
        }
        if ( chain && line[PDB_ATOM_CHAINID] != chain ) continue;

        if ( strncmp (line, "ATOM", 4) ) continue;

        /* if it's a hydrogen - skip */
        if ( line[PDB_ATOM_ATOM_NAME] == 'H' || line[PDB_ATOM_ATOM_NAME+1] == 'H' ) continue;

        /* a change in residue number + insertion code starts a new residue
           (+1 in PDB_ATOM_RES_NO_LEN+1 includes the insertion code) */
        if ( strncmp (line+PDB_ATOM_RES_NO, oldresno, PDB_ATOM_RES_NO_LEN+1) ) {

            strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
            strncpy (oldrestype, line+PDB_ATOM_RES_NAME, PDB_ATOM_RES_NAME_LEN);
            oldresno[PDB_ATOM_RES_NO_LEN+1]   = '\0';
            oldrestype[PDB_ATOM_RES_NAME_LEN] = '\0';
            /* handling the case when the chain is not given,
               meaning: "take the first chain" */
            old_chain = line[PDB_ATOM_CHAINID];

            resctr ++;
            if ( resctr >= no_res ) {
                fprintf (stderr, "Error reading pdb: resctr:%d   no res: %d\n",
                         resctr, no_res);
                status = ERR_NONSENSE;
                goto fail;
            }

            /* keep track of atom types we have read in */
            memset (atomtypes_read_in, 0, BUFFLEN*sizeof(char));
            atomtypes_read_in[0] = '_';
            seen_len = 1;

            /* atoms are counted as they are accepted, further down */
            sequence[resctr].no_atoms = 0;
            strncpy ( sequence[resctr].pdb_id, oldresno, PDB_ATOM_RES_NO_LEN+2);
            sequence[resctr].pdb_id[PDB_ATOM_RES_NO_LEN+1] = '\0';
            strncpy ( sequence[resctr].res_type, oldrestype, PDB_ATOM_RES_NAME_LEN+1);
            sequence[resctr].res_type[PDB_ATOM_RES_NAME_LEN] = '\0';
            /* modified residues are ok for the purposes here, so a zero
               res_type_short is not treated as an error; sugars and the like
               are meant to be caught later by checking the backbone atoms */
            sequence[resctr].res_type_short = single_letter ( sequence[resctr].res_type );
        }

        /* slot this atom will occupy if it is accepted; nothing is claimed
           until then, so a rejected line leaves no hole in atom[] */
        atomctr = sequence[resctr].no_atoms;

        /* atom name: skip to the first letter, then take the alphanumeric
           run, without leaving the PDB_ATOM_ATOM_NAME_LEN columns of the
           field (the following column is not part of the name) */
        auxptr = line + PDB_ATOM_ATOM_NAME;
        ctr = 0;
        while ( ctr < PDB_ATOM_ATOM_NAME_LEN
                && !isalpha ((unsigned char) auxptr[ctr]) ) ctr++;
        nonblank = 0;
        while ( ctr < PDB_ATOM_ATOM_NAME_LEN
                && isalnum ((unsigned char) auxptr[ctr]) ) {
            tmp[nonblank] = auxptr[ctr];
            nonblank ++;
            ctr ++;
        }
        tmp[nonblank] = '\0';

        /* a record without a usable atom name is dropped */
        if ( nonblank == 0 ) continue;

        /* have we already seen this atom type by any chance?  The name is
           delimited on both sides so that it can only match a whole entry
           of the list, never the tail of a longer name. */
        atom_key[0] = '_';
        memcpy (atom_key+1, tmp, (size_t) nonblank);
        atom_key[nonblank+1] = '_';
        atom_key[nonblank+2] = '\0';
        if ( strstr (atomtypes_read_in, atom_key) ) {
            /* should I check for an alt location code, or just move on? */
            continue;
        }

        if ( atomctr >= MAX_NO_ATOMS ) {
            fprintf ( stderr, "Error parsing pdb: I thought every aa has < %d atoms.\n",
                      MAX_NO_ATOMS );
            status = ERR_MAX_ATOMS;
            goto fail;
        }

        /* remember the name: append "NAME_" (plus terminator) in place */
        if ( seen_len + nonblank + 2 > BUFFLEN ) {
            fprintf ( stderr, "Error parsing pdb: too many atom names in residue %s.\n",
                      sequence[resctr].pdb_id );
            status = ERR_MAX_ATOMS;
            goto fail;
        }
        memcpy (atomtypes_read_in+seen_len, atom_key+1, (size_t) nonblank + 2);
        seen_len += nonblank + 1;

        /* the atom is accepted */
        sequence[resctr].no_atoms = atomctr + 1;
        strncpy ( sequence[resctr].atom[atomctr].type, tmp, PDB_ATOM_ATOM_NAME_LEN );

        /* is this a backbone atom? (N, C, O or CA) */
        sequence[resctr].atom[atomctr].backbone = 0;
        if ( nonblank == 1 ) {
            sequence[resctr].atom[atomctr].backbone =
                !( strcmp ( tmp, "N") && strcmp ( tmp, "C") && strcmp ( tmp, "O") );
        } else if ( nonblank == 2 && !strcmp ( tmp, "CA" ) ) {
            sequence[resctr].atom[atomctr].backbone = 1;
            sequence[resctr].Ca = sequence[resctr].atom + atomctr;
        }

        /* check if this is Ca trace: any non-CA atom says it is not */
        if ( strcmp ( tmp, "CA" ) ) ca_trace = 0;

        /* coordinates: fixed-width fields, copied out and converted */
        strncpy ( tmp, line+PDB_ATOM_X, PDB_ATOM_X_LEN);
        tmp[PDB_ATOM_X_LEN] = '\0';
        sequence[resctr].atom[atomctr].x = atof (tmp);
        strncpy ( tmp, line+PDB_ATOM_Y, PDB_ATOM_Y_LEN);
        tmp[PDB_ATOM_Y_LEN] = '\0';
        sequence[resctr].atom[atomctr].y = atof (tmp);
        strncpy ( tmp, line+PDB_ATOM_Z, PDB_ATOM_Z_LEN);
        tmp[PDB_ATOM_Z_LEN] = '\0';
        sequence[resctr].atom[atomctr].z = atof (tmp);
    }

    if ( ca_trace ) {
        status = ERR_SSE_NONE|ERR_CA_TRACE;
        goto fail;
    }

    /* clean PDB id tags from spaces */
    for ( resctr = 0; resctr < no_res; resctr ++ ) {
        retval = string_clean (sequence[resctr].pdb_id, PDB_ATOM_RES_NO_LEN+1);
        if ( retval ) {
            fprintf (stderr, "Error in read_pdb(): empty id string for residue with sequential no %d.\n", resctr);
            status = ERR_NONSENSE;
            goto fail;
        }
    }

    /* store the sequence and its length */
    protein->sequence = sequence;
    protein->length   = no_res;

    return 0;

 fail:
    /* *protein never received the array, so nobody else can release it.
       NOTE(review): assumes emalloc() hands out malloc-family memory - confirm */
    free (sequence);
    return status;
}