Example #1
0
/**
 * Flag the solvent-accessible residues of a protein from a DSSP file.
 *
 * The file named by options->dssp_file_name is opened with efopen().  Lines
 * are skipped up to and including the first one carrying "RESIDUE" at
 * offset 5; every later line is treated as a residue record unless it
 * contains '!'.  From each record the residue identifier (5 characters at
 * offset 6, passed through string_clean()) and the accessibility
 * (4 characters at offset 34, converted with atoi()) are extracted.  When
 * the accessibility is greater than options->acc_cutoff, every entry of
 * protein->sequence whose pdb_id equals the identifier gets
 * solvent_accessible = 1.
 *
 * Returns 0 when at least one residue was flagged, 1 otherwise (in the
 * "nothing flagged" case a message is written to stderr).
 */
int  read_dssp (Options * options, Protein *protein) {

    /* fixed column layout of a residue record, as used below */
    enum {
        DSSP_ID_OFFSET  = 6,
        DSSP_ID_LEN     = 5,
        DSSP_ACC_OFFSET = 34,
        DSSP_ACC_LEN    = 4
    };

    char line[LONGSTRING] = {'\0'};
    char pdb_id[PDB_ATOM_RES_NO_LEN+2] = {'\0'};
    char tmp[DSSP_ACC_LEN+1];
    int resctr, acc, total = 0;
    FILE *fptr;

    fptr = efopen(options->dssp_file_name,"r");
    /* NOTE(review): efopen() presumably reports the failure itself; this
       guard only matters if it can hand back NULL - confirm. */
    if ( ! fptr ) return 1;

    /* skip the preamble, up to and including the column-header line */
    while( fgets( line, LONGSTRING, fptr) != NULL){
        if( ! strncmp(line+5,"RESIDUE", 7)){
            break;
        }
    }

    while( fgets( line, LONGSTRING, fptr) != NULL){
        if ( strchr ( line, '!') ) continue;

        /* fgets() leaves the bytes after the terminator untouched, so on a
           line that ends before the accessibility column the fixed-offset
           reads below would pick up leftovers of an earlier, longer line */
        if ( strlen (line) <= DSSP_ACC_OFFSET ) continue;

        strncpy ( pdb_id, line+DSSP_ID_OFFSET, DSSP_ID_LEN);
        string_clean (pdb_id, DSSP_ID_LEN);
        if ( ! pdb_id[0] )  continue;

        strncpy ( tmp, line+DSSP_ACC_OFFSET, DSSP_ACC_LEN);
        tmp[DSSP_ACC_LEN] = '\0';
        acc = atoi(tmp);
        if ( acc <= options->acc_cutoff ) continue;

        /* every residue carrying this identifier is flagged (no early exit) */
        for ( resctr=0; resctr < protein->length; resctr++ ) {
            if ( !strcmp (pdb_id, protein->sequence[resctr].pdb_id) ) {
                protein->sequence[resctr].solvent_accessible = 1;
                total++;
            }
        }
    }
    fclose (fptr);

    if ( ! total ) {
        fprintf (stderr, "No residue solvent accessible (?!)\n");
        return 1;
    }

    return 0;
}
Example #2
0
/**
 * Read the residues and non-hydrogen atoms of one chain from a PDB file.
 *
 * The file is scanned twice.  The first pass counts residues: a new residue
 * is counted whenever the PDB_ATOM_RES_NO_LEN+1 characters at
 * PDB_ATOM_RES_NO (residue number plus insertion code) of an ATOM/HETATM
 * line differ from those of the previous one.  The second pass fills, per
 * residue, pdb_id, res_type, res_type_short (via single_letter()) and, per
 * atom, the alphabetic part of the atom name, a backbone flag (set for the
 * names N, C, O and CA) and the x/y/z coordinates.  Lines whose atom-name
 * field has 'H' in its first or second column are skipped in the second
 * pass.  Both passes stop at the first line starting with "TER" or "END".
 *
 * pdbname  path of the PDB file
 * protein  receives the residue array (->sequence) and its length
 *          (->length) on success; left untouched on failure
 * chain    chain identifier to select; when 0, no chain filtering is done
 *          and a warning is printed
 *
 * Returns 0 on success.  Returns 1 if the file cannot be opened, if
 * single_letter() returns 0 for a residue name, or if a residue has
 * MAX_NO_ATOMS or more atoms; on these paths the file is closed and the
 * residue array is released.
 */
int read_pdb ( char * pdbname, Protein * protein, char chain) {

    Residue * sequence = NULL;
    FILE * fptr = NULL;
    char line[BUFFLEN];
    char oldresno[PDB_ATOM_RES_NO_LEN+2]; /* residue number: 4 digits + insertion code + \0 */
    char oldrestype [PDB_ATOM_RES_NAME_LEN+2];
    char tmp[PDB_ATOM_X_LEN+1], *auxptr;
    int atomctr, resctr, no_res, ctr, nonblank;

    char single_letter ( char code[]);

    /* open file */
    fptr = fopen ( pdbname, "r");
    if ( !fptr ) {
        fprintf (stderr, "Cno %s.\n", pdbname);
        return 1;
    }
    /* warn if no chain given */
    if ( !chain)  fprintf ( stderr,"No chain specified. Using the first one.\n");

    /* first pass: count residues */
    memset (line,  0, BUFFLEN);
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    resctr = 0;
    while(fgets(line, BUFFLEN, fptr)!=NULL){
        if ( chain && line[PDB_ATOM_CHAINID] != chain ) continue;
        if ( ! strncmp(line,"TER", 3) ||  ! strncmp(line,"END", 3) ) break;
        if( ! strncmp(line,"ATOM", 4) ||  ! strncmp(line,"HETATM", 6)){
            if (  strncmp (line+PDB_ATOM_RES_NO, oldresno,  PDB_ATOM_RES_NO_LEN+1) ) {
                strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
                oldresno[PDB_ATOM_RES_NO_LEN+1] = '\0';
                resctr ++;
            }
        }
    }
    no_res = resctr;

    /* allocate space */
    sequence = emalloc ( no_res*sizeof (Residue));

    /* second pass: read in the atoms */
    rewind ( fptr);
    memset (line,  0, BUFFLEN);
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    /* the residue identifier includes the insertion code */
    memset (oldrestype, 0, PDB_ATOM_RES_NAME_LEN+2);
    resctr= -1;
    atomctr = 0;
    while(fgets(line, BUFFLEN, fptr)!=NULL){
        if ( chain && line[PDB_ATOM_CHAINID] != chain ) continue;
        if ( ! strncmp(line,"TER", 3) ||  ! strncmp(line,"END", 3) ) break;
        if ( strncmp(line,"ATOM", 4)  &&  strncmp(line,"HETATM", 6) ) continue;

        /* if it's a hydrogen - skip */
        if ( line[PDB_ATOM_ATOM_NAME] == 'H'
             ||  line[PDB_ATOM_ATOM_NAME+1] == 'H') continue;

        /* adjust the counters */
        if (  strncmp (line+PDB_ATOM_RES_NO, oldresno,  PDB_ATOM_RES_NO_LEN+1) ) {
            strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
            strncpy (oldrestype, line+PDB_ATOM_RES_NAME, PDB_ATOM_RES_NAME_LEN);
            oldresno[PDB_ATOM_RES_NO_LEN+1] = '\0';
            oldrestype[PDB_ATOM_RES_NAME_LEN] = '\0';
            resctr ++;
            atomctr = 0;

            sequence[resctr].no_atoms = 1;
            strncpy ( sequence[resctr].pdb_id, oldresno, PDB_ATOM_RES_NO_LEN+2);
            sequence[resctr].pdb_id[PDB_ATOM_RES_NO_LEN+1]   = '\0';

            strncpy ( sequence[resctr].res_type, oldrestype, PDB_ATOM_RES_NAME_LEN+1);
            sequence[resctr].res_type[PDB_ATOM_RES_NAME_LEN] = '\0';
            sequence[resctr].res_type_short  = single_letter ( sequence[resctr].res_type );
            if ( !sequence[resctr].res_type_short ) goto failure;

        } else {
            /* a truncated first atom line can compare equal to the still
               empty oldresno; there is no residue to attach it to yet */
            if ( resctr < 0 ) continue;

            atomctr ++;
            sequence[resctr].no_atoms = atomctr + 1;
            if ( atomctr >= MAX_NO_ATOMS ) {
                fprintf ( stderr, "Error: I thought every aa has < %d atoms.\n",
                          MAX_NO_ATOMS );
                goto failure;
            }
        }

        /* read in atom info */
        auxptr = line+ PDB_ATOM_ATOM_NAME;
        memset ( tmp, 0, PDB_ATOM_ATOM_NAME_LEN+1);
        /* skip everything up to the first letter of the atom name;
           the cast keeps the <ctype.h> calls defined for bytes >= 0x80 */
        /* NOTE(review): the "<=" bound lets the scan reach one character
           past PDB_ATOM_ATOM_NAME_LEN, and only letters are copied (the
           sibling fill_protein_info() copies alphanumerics) - both kept as
           found; confirm against the column definitions. */
        ctr  = 0;
        while ( (ctr <= PDB_ATOM_ATOM_NAME_LEN)
                && !isalpha ((unsigned char) auxptr[ctr]) ) ctr++;
        /* copy the alphabetic run */
        nonblank = 0;
        while ( (ctr <= PDB_ATOM_ATOM_NAME_LEN)
                &&  isalpha ((unsigned char) auxptr[ctr]) ) {
            tmp[nonblank] = auxptr[ctr];
            nonblank ++;
            ctr++;
        }

        strncpy ( sequence[resctr].atom[atomctr].type, tmp, PDB_ATOM_ATOM_NAME_LEN );

        /* is this a backbone atom?*/
        sequence[resctr].atom[atomctr].backbone = 0;
        if ( nonblank == 1) {
            sequence[resctr].atom[atomctr].backbone =
                !(  strcmp ( tmp, "N") && strcmp ( tmp, "C") && strcmp ( tmp, "O")  );
        } else if (  nonblank == 2) {
            sequence[resctr].atom[atomctr].backbone = ! strcmp ( tmp, "CA" );
        }

        /* coordinates: fixed-width fields, copied out and converted */
        strncpy ( tmp, line+PDB_ATOM_X, PDB_ATOM_X_LEN);
        tmp[PDB_ATOM_X_LEN] = '\0';
        sequence[resctr].atom[atomctr].x=atof(tmp);
        strncpy ( tmp, line+PDB_ATOM_Y, PDB_ATOM_Y_LEN);
        tmp[PDB_ATOM_Y_LEN] = '\0';
        sequence[resctr].atom[atomctr].y=atof(tmp);
        strncpy ( tmp, line+PDB_ATOM_Z, PDB_ATOM_Z_LEN);
        tmp[PDB_ATOM_Z_LEN] = '\0';
        sequence[resctr].atom[atomctr].z=atof(tmp);
    }

    /* close file */
    fclose (fptr);

    /* clean PDB id tags from spaces */
    for (resctr=0; resctr < no_res; resctr ++ ) {
        string_clean (sequence[resctr].pdb_id, PDB_ATOM_RES_NO_LEN+2);
    }

    /*return values: */
    protein->sequence= sequence;
    protein->length  = no_res;

    return 0;

failure:
    /* nothing has been handed to the caller yet: release what we hold */
    fclose (fptr);
    /* assumes emalloc() memory is released with free() - TODO confirm */
    free (sequence);
    return 1;
}
/*
	MAIN()
	------
	Usage: <program> [-chinese] [-lowercase] <index> <file_to_link> ...

	Loads the term index named by the first non-option argument with
	read_index(), then walks each remaining argument with
	ANT_directory_iterator_recursive.  Every file for which get_doc_id()
	is positive is passed through string_clean() and split into tokens
	(create_utf8_token_list() in -chinese mode, strtok() on spaces
	otherwise).  Starting at each token, progressively longer runs of
	consecutive tokens are looked up with find_term_in_list(); an exact
	match bumps the term's total_occurences and, once per document, its
	docs_containing_term.  A run stops growing as soon as it is neither a
	match nor a prefix of the term returned by the lookup.  The counts
	are finally written out with print_answer().

	first_param is advanced once per recognised option wherever the
	option appears, so options are effectively expected before <index>.
*/
int main(int argc, char *argv[])
{
    static const char *seperators = " ";
    static char buffer[1024 * 1024];		// phrase being assembled; static so 1MB does not sit on the stack
    char *file, *token, *where_to, *filename;
    char **term_list, **first, **last, **current;
    ANT_link_extract_term *link_index, *index_term;
    long terms_in_index, current_docid, param, file_number;
    long lowercase_only, first_param;
    long is_utf8_token, cmp, is_substring = FALSE;
    char *command;
    size_t token_length;
    ANT_directory_iterator_object file_object;

    if (argc < 3)
        exit(printf("Usage:%s [-chinese] [-lowercase] <index> <file_to_link> ...\n", argv[0]));

    first_param = 1;
    lowercase_only = FALSE;
    chinese = FALSE;

    for (param = 1; param < argc; param++)
    {
        if (*argv[param] == '-')
        {
            command = argv[param] + 1;
            if (strcmp(command, "lowercase") == 0)
            {
                lowercase_only = TRUE;
                ++first_param;
            }
            else if (strcmp(command, "chinese") == 0)
            {
                chinese = TRUE;
                ++first_param;
            }
            else
                exit(printf("Unknown parameter:%s\n", argv[param]));
        }
    }

    /*
    	With options present argc >= 3 no longer guarantees an index argument
    	(e.g. "prog -chinese -lowercase"), and argv[argc] is NULL.
    */
    if (first_param >= argc)
        exit(printf("Usage:%s [-chinese] [-lowercase] <index> <file_to_link> ...\n", argv[0]));

    link_index = read_index(argv[first_param], &terms_in_index);

    file_number = 1;
    for (param = first_param + 1; param < argc; param++)
    {
        ANT_directory_iterator_recursive disk(argv[param]);  // make the recursive pattern matching as for default files reading
        if (disk.first(&file_object) == NULL)
            file = filename = NULL;
        else
        {
            filename = file_object.filename;
            file = ANT_disk::read_entire_file(filename);
        }
        while (file != NULL)
        {
            current_docid = get_doc_id(file);
            if (current_docid > 0)
            {
                string_clean(file, lowercase_only, TRUE);

                /*
                	Worst case by far: at most one token per character, plus one
                	slot for the NULL terminator (needed when the text is a
                	single character or empty).
                */
                current = term_list = new char *[strlen(file) + 1];
                if (chinese)
                    create_utf8_token_list(file, term_list);
                else
                {
                    for (token = strtok(file, seperators); token != NULL; token = strtok(NULL, seperators))
                        *current++ = token;
                    *current = NULL;
                }

                for (first = term_list; *first != NULL; first++)
                {
                    where_to = buffer;
                    for (last = first; *last != NULL; last++)
                    {
                        /*
                        	Room for the token, an optional separating space and the
                        	terminator; a phrase that does not fit is abandoned rather
                        	than written past the end of the buffer.
                        */
                        token_length = strlen(*last);
                        if ((size_t)(where_to - buffer) + token_length + 2 > sizeof(buffer))
                            break;

                        if (where_to == buffer)
                        {
                            strcpy(buffer, *first);
                            where_to = buffer + strlen(buffer);
                            if (chinese)
                            {
                                // NOTE(review): is_utf8_token is assigned here but never read in this function
                                if ((*first[0] & 0x80) && isutf8(*first))
                                    is_utf8_token = TRUE;
                                else
                                    is_utf8_token = FALSE;
                            }
                        }
                        else
                        {
                            if (!chinese)
                                *where_to++ = ' ';
                            strcpy(where_to, *last);
                            where_to += token_length;
                        }

                        *where_to = '\0';

                        index_term = find_term_in_list(buffer, link_index, terms_in_index);

                        if (index_term == NULL)
                            break;		// we're after the last term in the list so can stop because we can't be a substring

                        if (chinese)
                        {
                            is_substring = FALSE;
                            cmp = utf8_token_compare(buffer, index_term->term, &is_substring);
                        }
                        else
                            cmp = string_compare(buffer, index_term->term);

                        if (cmp == 0)		// we're a term in the list
                        {
                            index_term->total_occurences++;
                            if (index_term->last_docid != current_docid)
                            {
                                index_term->last_docid = current_docid;
                                index_term->docs_containing_term++;
                            }
                        }
                        else
                        {
                            if (chinese)
                                cmp = is_substring == TRUE ? 0 : 1;
                            else
                                cmp = memcmp(buffer, index_term->term, strlen(buffer));
                            if  (cmp != 0)
                                break;		// we're a not a substring so we can't find a longer term
                        }
                    }
                }
                if (chinese)
                    free_utf8_token_list(term_list);
                delete [] term_list;
                delete [] file;

                if (file_number % 1000 == 0)
                    fprintf(stderr, "Files processed:%ld\n", file_number);		// file_number is a long
                file_number++;
            }
            else
            {
                fprintf(stderr, "Error reading file %s\n", filename);
                delete [] file;		// the rejected document's text must be released as well
            }

            if (disk.next(&file_object) == NULL)
                file = filename = NULL;
            else
            {
                filename = file_object.filename;
                file = ANT_disk::read_entire_file(filename);
            }
        }
    }

    print_answer(link_index, terms_in_index);

    fprintf(stderr, "%s Completed\n", argv[0]);

    return 0;
}
Example #4
0
/**
 * Read one chain's residues and non-hydrogen atoms from an open PDB stream.
 *
 * *protein is zeroed first.  A counting pass then reads from the current
 * stream position and counts residues on ATOM lines (a new residue starts
 * whenever the PDB_ATOM_RES_NO_LEN+1 characters at PDB_ATOM_RES_NO change).
 * The stream is rewound and a second pass fills pdb_id, res_type,
 * res_type_short and, per atom, the name, a backbone flag (N, C, O, CA),
 * the residue's Ca pointer and the x/y/z coordinates.  Lines with 'H' in
 * the first or second column of the atom-name field are skipped, and so is
 * an atom whose name was already stored for the same residue or whose name
 * field contains no letter.
 *
 * fptr     open PDB stream; it is rewound but not closed here
 * chain    chain identifier to select; 0 disables chain filtering
 * protein  receives ->sequence and ->length on success
 *
 * Returns
 *   0                         success
 *   -1                        no residue was counted
 *   1                         emalloc() returned NULL
 *   ERR_NO_FILE_OR_CHAIN      chain given but no line carried it
 *   ERR_NONSENSE              more residues in the second pass than counted,
 *                             or string_clean() reported a failure on a pdb_id
 *   ERR_MAX_ATOMS             a residue has MAX_NO_ATOMS or more atoms
 *   ERR_SSE_NONE|ERR_CA_TRACE no stored atom has a name other than "CA"
 * On every non-zero return taken after the allocation the residue array is
 * released, since it has not been handed to the caller.
 *
 * NOTE(review): the counting pass starts at the current stream position
 * while the second pass starts after rewind() - confirm callers always pass
 * a stream positioned at the beginning.
 */
int fill_protein_info ( FILE * fptr,  char chain, Protein * protein) {

    /* TODO for the moment we rely on PDB annotation
       to extract structural elements - that should be changed */
    Residue * sequence = NULL;
    char line[BUFFLEN];
    char oldresno[PDB_ATOM_RES_NO_LEN+2];
    /* residue number: 4 digits + insertion code + \0 */
    char oldrestype [PDB_ATOM_RES_NAME_LEN+2];
    char tmp[BUFFLEN], *auxptr;
    char atomtypes_read_in[BUFFLEN];  /* "_N_CA_C_...": names stored for the current residue */
    char atom_key[BUFFLEN];           /* "_NAME_": the delimited name searched for */
    char old_chain;
    int atomctr, resctr,  no_res,ctr, nonblank;
    int retval;
    int chain_found;
    int ca_trace;
    size_t used;
    char single_letter ( char code[]);

    /********************************************/
    /********************************************/
    /* cleanup                                  */
    memset (protein, 0, sizeof(Protein) );

    /********************************************/
    /********************************************/
    /* count residues                           */
    memset (line,  0, BUFFLEN);
    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    resctr = 0;
    chain_found = 0;
    old_chain = '\0';
    while(fgets(line, BUFFLEN, fptr)!=NULL){

        if (resctr) {
            if ( ! strncmp(line,"END", 3) ||  (chain && line[PDB_ATOM_CHAINID] != old_chain) )
                break;
        }
        if (chain  && line[PDB_ATOM_CHAINID] != chain) continue;
        chain_found  = 1;

        if( ! strncmp(line,"ATOM", 4)){

            if (  strncmp (line+PDB_ATOM_RES_NO, oldresno,  PDB_ATOM_RES_NO_LEN+1) ) {

                strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
                oldresno[PDB_ATOM_RES_NO_LEN+1] = '\0';
                /* handling the case when the chain is not given, meaning: "take the first chain" */
                old_chain = line[PDB_ATOM_CHAINID];
                resctr ++;
            }
        }
    }

    /* sanity: */
    if ( chain && ! chain_found) {
        fprintf (stderr, "Chain %c not found.\n", chain);
        return ERR_NO_FILE_OR_CHAIN;
    }

    no_res = resctr;

    if ( !no_res ) return -1;  /* take it as the end of the read */

    /* allocate space */
    sequence = emalloc ( no_res*sizeof (Residue));
    if ( ! sequence ) return 1;


    /*********************************************/
    /*********************************************/
    /*   read in residue numbers and atom coords */
    rewind ( fptr);
    memset (line,  0, BUFFLEN);
    old_chain = '\0';

    memset (oldresno, 0, PDB_ATOM_RES_NO_LEN+2);
    memset (oldrestype, 0, PDB_ATOM_RES_NAME_LEN+2);
    memset (atomtypes_read_in, 0, BUFFLEN*sizeof(char));
    resctr= -1;
    atomctr = -1;   /* index of the last atom stored for the current residue */
    ca_trace = 1;
    while(fgets(line, BUFFLEN, fptr)!=NULL){

        if ( resctr > -1) {
            if  (! strncmp(line,"END", 3)  ||  (chain && line[PDB_ATOM_CHAINID] != old_chain))
                break;
        }
        if ( chain  && line[PDB_ATOM_CHAINID] != chain ) continue;
        if ( strncmp(line,"ATOM", 4) ) continue;

        /* if it's a hydrogen - skip */
        if ( line[PDB_ATOM_ATOM_NAME] == 'H'
             ||  line[PDB_ATOM_ATOM_NAME+1] == 'H') continue;

        /* adjust the counters */
        if (  strncmp (line+PDB_ATOM_RES_NO, oldresno,  PDB_ATOM_RES_NO_LEN+1) ) {
            /*+1 in  PDB_ATOM_RES_NO_LEN+1 means the insertion code is
              included in the identifier */
            strncpy (oldresno, line+PDB_ATOM_RES_NO, PDB_ATOM_RES_NO_LEN+1);
            strncpy (oldrestype, line+PDB_ATOM_RES_NAME, PDB_ATOM_RES_NAME_LEN);
            oldresno[PDB_ATOM_RES_NO_LEN+1]   = '\0';
            oldrestype[PDB_ATOM_RES_NAME_LEN] = '\0';

            /* handling the case when the chain is not given, meaning: "take the first chain" */
            old_chain = line[PDB_ATOM_CHAINID];

            resctr ++;
            if ( resctr >= no_res ) {
                fprintf (stderr, "Error reading pdb: resctr:%d   no res: %d\n",
                         resctr, no_res);
                free (sequence);
                return ERR_NONSENSE;
            }
            /* nothing stored for this residue yet */
            atomctr = -1;
            sequence[resctr].no_atoms = 0;
            /* keep track of atom types we have read in */
            memset (atomtypes_read_in, 0, BUFFLEN*sizeof(char));
            atomtypes_read_in[0] = '_';

            strncpy ( sequence[resctr].pdb_id, oldresno, PDB_ATOM_RES_NO_LEN+2);
            sequence[resctr].pdb_id[PDB_ATOM_RES_NO_LEN+1]   = '\0';

            strncpy ( sequence[resctr].res_type, oldrestype, PDB_ATOM_RES_NAME_LEN+1);
            sequence[resctr].res_type[PDB_ATOM_RES_NAME_LEN] = '\0';
            sequence[resctr].res_type_short  =
                single_letter ( sequence[resctr].res_type );
            /* modified residues are ok for the purposes here */
            /* unless they are sugars or some such - deal with it below */
            /* by checking the backbone atoms */

        } else if ( resctr < 0 ) {
            /* a truncated first ATOM line can compare equal to the still
               empty oldresno; there is no residue to attach it to yet */
            continue;
        }

        /* extract the atom name: skip to the first letter, then copy the
           alphanumeric run; the casts keep <ctype.h> defined for bytes >= 0x80 */
        /* NOTE(review): the "<=" bound lets the scan reach one character
           past PDB_ATOM_ATOM_NAME_LEN - kept as found; confirm against the
           column definitions. */
        auxptr = line+ PDB_ATOM_ATOM_NAME;
        ctr  = 0;
        while ( (ctr <= PDB_ATOM_ATOM_NAME_LEN)
                && !isalpha ((unsigned char) auxptr[ctr]) ) ctr++;
        nonblank = 0;
        while ( (ctr <= PDB_ATOM_ATOM_NAME_LEN)
                &&  isalnum ((unsigned char) auxptr[ctr]) ) {
            tmp[nonblank] = auxptr[ctr];
            nonblank ++;
            ctr++;
        }
        /* terminate explicitly: tmp still holds the previous atom's
           coordinate text beyond this point */
        tmp[nonblank] = '\0';

        /* an atom without a name is skipped, as before */
        if ( nonblank == 0 ) continue;

        /* have we already seen this atom type by any chance?  The name is
           searched with both delimiters so that it cannot match the tail of
           a longer name; the check comes before any counter is touched so
           that a skipped atom does not leave an unfilled slot behind */
        snprintf (atom_key, sizeof atom_key, "_%s_", tmp);
        if ( strstr (atomtypes_read_in, atom_key) ) continue;

        if ( atomctr + 1 >= MAX_NO_ATOMS ) {
            fprintf ( stderr,
                      "Error parsing pdb: I thought every aa has < %d atoms.\n",
                      MAX_NO_ATOMS );
            free (sequence);
            return ERR_MAX_ATOMS;
        }
        atomctr ++;
        sequence[resctr].no_atoms = atomctr + 1;

        /* remember the name; appended at the current end of the list
           (formatting a buffer into itself is undefined) and only when the
           name and its delimiter fit completely */
        used = strlen (atomtypes_read_in);
        if ( used + (size_t) nonblank + 2 <= sizeof atomtypes_read_in ) {
            snprintf (atomtypes_read_in + used, sizeof atomtypes_read_in - used,
                      "%s_", tmp);
        }

        strncpy ( sequence[resctr].atom[atomctr].type, tmp, PDB_ATOM_ATOM_NAME_LEN );

        /* is this a backbone atom?*/
        sequence[resctr].atom[atomctr].backbone = 0;
        if ( nonblank == 1) {
            sequence[resctr].atom[atomctr].backbone =
                !(  strcmp ( tmp, "N") && strcmp ( tmp, "C") && strcmp ( tmp, "O")  );
        } else if (  nonblank == 2) {
            if (  ! strcmp ( tmp, "CA" )) {
                sequence[resctr].atom[atomctr].backbone = 1;
                sequence[resctr].Ca = sequence[resctr].atom+atomctr;
            }  else  {
                sequence[resctr].atom[atomctr].backbone = 0;
            }
        }
        /* check if this is Ca trace */
        if ( strcmp ( tmp, "CA" ) ) ca_trace = 0;

        /* coordinates: fixed-width fields, copied out and converted */
        strncpy ( tmp, line+PDB_ATOM_X, PDB_ATOM_X_LEN);
        tmp[PDB_ATOM_X_LEN] = '\0';
        sequence[resctr].atom[atomctr].x=atof(tmp);
        strncpy ( tmp, line+PDB_ATOM_Y, PDB_ATOM_Y_LEN);
        tmp[PDB_ATOM_Y_LEN] = '\0';
        sequence[resctr].atom[atomctr].y=atof(tmp);
        strncpy ( tmp, line+PDB_ATOM_Z, PDB_ATOM_Z_LEN);
        tmp[PDB_ATOM_Z_LEN] = '\0';
        sequence[resctr].atom[atomctr].z=atof(tmp);
    }

    /* assumes emalloc() memory is released with free() - TODO confirm */
    if ( ca_trace) {
        free (sequence);
        return ERR_SSE_NONE|ERR_CA_TRACE;
    }

    /* clean PDB id tags from spaces */
    for (resctr=0; resctr < no_res; resctr ++ ) {
        retval = string_clean (sequence[resctr].pdb_id, PDB_ATOM_RES_NO_LEN+1);
        if ( retval ) {
            fprintf (stderr, "Error in read_pdb(): empty id string for residue with sequential no %d.\n", resctr);
            free (sequence);
            return ERR_NONSENSE;
        }
    }

    /* store the sequence and its length */
    protein->sequence = sequence;
    protein->length   = no_res;

    return 0;
}