/*
 * Ruby binding: fetch the data of the entry at a given position.
 *
 * self - object wrapping an ffindex_db_t (unwrapped with Data_Get_Struct).
 * key  - Fixnum position, handed to ffindex_get_entry_by_index() after
 *        conversion to size_t.
 *
 * Returns a new Ruby String built with rb_str_new2() from the entry's data,
 * or nil when ffindex_get_entry_by_index() yields no entry.
 * Raises TypeError when key is not a Fixnum.
 */
VALUE method_ffindex_get_data_by_index(VALUE self, VALUE key)
{
  // FIX2INT performs no type checking, so reject anything that is not a
  // Fixnum up front (the by-name lookup validates its key the same way).
  Check_Type(key, T_FIXNUM);

  ffindex_db_t * ffindex_db;
  Data_Get_Struct(self, ffindex_db_t, ffindex_db);

  // NOTE(review): a negative key wraps to a very large size_t here; this
  // presumably makes the lookup fail and return nil -- confirm.
  size_t index = FIX2INT(key);

  ffindex_entry_t * entry = ffindex_get_entry_by_index(ffindex_db->ffindex, index);
  if(entry == NULL)
    return Qnil;

  char * data = ffindex_get_data_by_entry(ffindex_db->ffdata, entry);
  return rb_str_new2(data);
}
/*
 * Ruby binding: fetch the data of the entry with a given name.
 *
 * self - object wrapping an ffindex_db_t (unwrapped with Data_Get_Struct).
 * key  - Ruby String holding the entry name.
 *
 * Returns a new Ruby String built with rb_str_new2() from the entry's data,
 * or nil when ffindex_get_entry_by_name() finds nothing.
 * Raises TypeError when key is not a String and NoMemoryError when the
 * temporary copy of the name cannot be allocated.
 */
VALUE method_ffindex_get_data_by_name(VALUE self, VALUE key)
{
  Check_Type(key, T_STRING);

  // Unwrap the handle before allocating: Data_Get_Struct may raise, and a
  // raise after the allocation would leak the buffer.
  ffindex_db_t * ffindex_db;
  Data_Get_Struct(self, ffindex_db_t, ffindex_db);

  // Ruby strings carry an explicit length, so build a NUL-terminated copy
  // for the C lookup (calloc zero-fills, which supplies the terminator).
  size_t name_len = (size_t) RSTRING_LEN(key);
  char * name = (char *) calloc(name_len + 1, sizeof(char));
  if(name == NULL)
    rb_raise(rb_eNoMemError, "ffindex: could not allocate lookup key");
  memcpy(name, StringValuePtr(key), name_len);

  ffindex_entry_t * entry = ffindex_get_entry_by_name(ffindex_db->ffindex, name);

  // The copy is only needed for the lookup itself; release it so that it is
  // not leaked on every call.
  // NOTE(review): assumes ffindex_get_entry_by_name() does not keep the
  // pointer it is given -- confirm.
  free(name);

  if(entry == NULL)
    return Qnil;

  char * data = ffindex_get_data_by_entry(ffindex_db->ffdata, entry);
  return rb_str_new2(data);
}
////////////////////////////////////////////////////////////// // Reading in column state sequences for prefiltering ////////////////////////////////////////////////////////////// void Prefilter::init_prefilter(FFindexDatabase* cs219_database) { // Set up variables for prefiltering num_dbs = cs219_database->db_index->n_entries; first = (unsigned char**) mem_align(ALIGN_FLOAT, num_dbs * sizeof(unsigned char*)); length = (int*) mem_align(ALIGN_FLOAT, num_dbs * sizeof(int)); dbnames = (char**) mem_align(ALIGN_FLOAT, num_dbs * sizeof(char*)); for (size_t n = 0; n < num_dbs; n++) { ffindex_entry_t* entry = ffindex_get_entry_by_index( cs219_database->db_index, n); first[n] = (unsigned char*) ffindex_get_data_by_entry( cs219_database->db_data, entry); length[n] = entry->length - 1; dbnames[n] = new char[strlen(entry->name) + 1]; strcpy(dbnames[n], entry->name); } //check if cs219 format is new binary format checkCSFormat(5); HH_LOG(INFO) << "Searching " << num_dbs << " column state sequences." << std::endl; }
/*
 * Run an external program with one entry's data on its stdin and, optionally,
 * store what the program prints as a new entry.
 *
 * data            - base of the data block; passed to ffindex_get_data_by_entry().
 * index           - not used by this function.
 * entry           - entry whose data is fed to the program; its name is also
 *                   used for error messages and for the inserted output entry.
 * program_name    - program handed to execvp().
 * program_argv    - argument vector handed to execvp().
 * data_file_out   - when non-NULL, the child's stdout is captured and inserted
 *                   via ffindex_insert_memory(); when NULL stdout is inherited.
 * index_file_out  - index stream passed to ffindex_insert_memory().
 * offset          - running offset passed to ffindex_insert_memory().
 *
 * Returns EXIT_SUCCESS, or the errno value when pipe() or fork() fails.
 * The child's exit status is not inspected.
 *
 * NOTE(review): the caller presumably has to ignore SIGPIPE; otherwise a child
 * that closes its stdin early kills this process instead of producing EPIPE.
 */
int ffindex_apply_by_entry(char *data, ffindex_index_t* index, ffindex_entry_t* entry, char* program_name, char** program_argv, FILE* data_file_out, FILE* index_file_out, size_t *offset)
{
  int ret = 0;
  int capture_stdout = (data_file_out != NULL);

  int pipefd_stdin[2];
  int pipefd_stdout[2];

  ret = pipe(pipefd_stdin);
  if(ret != 0)
  {
    // fprintf may clobber errno, so remember the real failure reason.
    int saved_errno = errno;
    fprintf(stderr, "ERROR in pipe stdin!\n");
    errno = saved_errno;
    perror(entry->name);
    return saved_errno;
  }

  if(capture_stdout)
  {
    ret = pipe(pipefd_stdout);
    if(ret != 0)
    {
      int saved_errno = errno;
      fprintf(stderr, "ERROR in pipe stdout!\n");
      errno = saved_errno;
      perror(entry->name);
      // Do not leak the stdin pipe that was already created.
      close(pipefd_stdin[0]);
      close(pipefd_stdin[1]);
      return saved_errno;
    }
  }

  // Flush so child doesn't copy and also flushes, leading to duplicate output.
  // (With a NULL stream fflush() flushes every open output stream.)
  fflush(data_file_out);
  fflush(index_file_out);

  pid_t child_pid = fork();

  if(child_pid == 0)
  {
    close(pipefd_stdin[1]);
    if(capture_stdout)
    {
      fclose(data_file_out);
      fclose(index_file_out);
      close(pipefd_stdout[0]);
    }

    // Make pipe from parent our new stdin
    int newfd_in = dup2(pipefd_stdin[0], fileno(stdin));
    if(newfd_in < 0)
    {
      fprintf(stderr, "ERROR in dup2 in %d %d\n", pipefd_stdin[0], newfd_in);
      perror(entry->name);
      // Running the program on the wrong stdin would be worse than failing.
      _exit(EXIT_FAILURE);
    }
    close(pipefd_stdin[0]);

    if(capture_stdout)
    {
      int newfd_out = dup2(pipefd_stdout[1], fileno(stdout));
      if(newfd_out < 0)
      {
        fprintf(stderr, "ERROR in dup2 out %d %d\n", pipefd_stdout[1], newfd_out);
        perror(entry->name);
        _exit(EXIT_FAILURE);
      }
      close(pipefd_stdout[1]);
    }

    // exec program with the pipe as stdin
    execvp(program_name, program_argv);

    // Only reached when exec failed. The child must terminate here; falling
    // through would make it return into the caller's code as a second copy
    // of the parent process.
    perror(program_name);
    _exit(127);
  }
  else if(child_pid > 0)
  {
    // parent writes to and possibly reads from child
    int flags = 0;

    // Read end is for child only
    close(pipefd_stdin[0]);
    if(capture_stdout)
      close(pipefd_stdout[1]);

    char *filedata = ffindex_get_data_by_entry(data, entry);

    if(capture_stdout)
    {
      // Non-blocking so the opportunistic reads in the write loop never stall.
      flags = fcntl(pipefd_stdout[0], F_GETFL, 0);
      fcntl(pipefd_stdout[0], F_SETFL, flags | O_NONBLOCK);
    }

    // Write file data to child's stdin.
    size_t written = 0;
    // Don't write ffindex trailing '\0'; guard against a zero-length entry,
    // where the subtraction would wrap around.
    size_t to_write = entry->length > 0 ? entry->length - 1 : 0;

    // NOTE(review): nothing here bounds the output against the capacity of
    // read_buffer, which is defined outside this function -- confirm it is
    // large enough for the biggest program output.
    char* b = read_buffer;

    while(written < to_write)
    {
      size_t rest = to_write - written;
      int batch_size = PIPE_BUF;
      if(rest < PIPE_BUF)
        batch_size = rest;

      ssize_t w = write(pipefd_stdin[1], filedata + written, batch_size);
      if(w < 0)
      {
        if(errno == EINTR)
          continue; // interrupted before anything was written: retry

        // EPIPE just means the child stopped reading; that is not an error,
        // but there is no point in writing any further either.
        if(errno != EPIPE)
        {
          fprintf(stderr, "ERROR in child!\n");
          perror(entry->name);
        }
        break;
      }
      written += (size_t) w;

      if(capture_stdout)
      {
        // To avoid blocking try to read some data
        ssize_t r = read(pipefd_stdout[0], b, PIPE_BUF);
        if(r > 0)
          b += r;
      }
    }
    close(pipefd_stdin[1]); // child gets EOF

    if(capture_stdout)
    {
      // Read rest
      fcntl(pipefd_stdout[0], F_SETFL, flags); // Remove O_NONBLOCK
      for(;;)
      {
        ssize_t r = read(pipefd_stdout[0], b, PIPE_BUF);
        if(r > 0)
          b += r;
        else if(r < 0 && errno == EINTR)
          continue; // interrupted, not end of output
        else
          break;
      }
      close(pipefd_stdout[0]);

      ffindex_insert_memory(data_file_out, index_file_out, offset, read_buffer, b - read_buffer, entry->name);
    }

    waitpid(child_pid, NULL, 0);
  }
  else
  {
    int saved_errno = errno;
    fprintf(stderr, "ERROR in fork()\n");
    errno = saved_errno;
    perror(entry->name);

    // No child was created, so every pipe end is still ours to close.
    close(pipefd_stdin[0]);
    close(pipefd_stdin[1]);
    if(capture_stdout)
    {
      close(pipefd_stdout[0]);
      close(pipefd_stdout[1]);
    }
    return saved_errno;
  }

  return EXIT_SUCCESS;
}
int main(int argc, char **argv) { bool iflag, dflag, oflag, qflag; iflag = dflag = oflag = qflag = false; std::string ffindex_header_db_prefix; std::string ffindex_sequence_db_prefix; std::string ffindex_ca3m_db_prefix; std::string ffindex_a3m_db_prefix; int c; while ((c = getopt(argc, argv, "i:d:o:q:h")) != -1) { switch (c) { case 'i': iflag = 1; ffindex_ca3m_db_prefix = optarg; break; case 'd': dflag = 1; ffindex_sequence_db_prefix = optarg; break; case 'o': oflag = 1; ffindex_a3m_db_prefix = optarg; break; case 'q': qflag = 1; ffindex_header_db_prefix = optarg; break; case 'h': usage(); exit(0); case '?': if (optopt == 'c') fprintf(stderr, "Option -%c requires an argument.\n", optopt); else if (isprint(optopt)) fprintf(stderr, "Unknown option `-%c'.\n", optopt); else fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); return 1; default: abort(); } } if(!iflag || !dflag || !oflag || !qflag) { std::cerr << "Missing arguments!" << std::endl; usage(); exit(0); } //prepare ffindex a3m database std::string a3mDataFile = ffindex_a3m_db_prefix+".ffdata"; std::string a3mIndexFile = ffindex_a3m_db_prefix+".ffindex"; FILE *a3m_data_fh = fopen(a3mDataFile.c_str(), "w"); FILE *a3m_index_fh = fopen(a3mIndexFile.c_str(), "w"); if (a3m_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m data file! (" << a3mDataFile << ")!" << std::endl; exit(1); } if(a3m_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m index file! (" << a3mIndexFile << ")!" << std::endl; exit(1); } size_t a3m_offset = 0; //prepare ffindex ca3m database std::string ca3mDataFile = ffindex_ca3m_db_prefix+".ffdata"; std::string ca3mIndexFile = ffindex_ca3m_db_prefix+".ffindex"; FILE *ca3m_data_fh = fopen(ca3mDataFile.c_str(), "r"); FILE *ca3m_index_fh = fopen(ca3mIndexFile.c_str(), "r"); if (ca3m_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m data file! (" << ca3mDataFile << ")!" 
<< std::endl; exit(1); } if(ca3m_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m index file! (" << ca3mIndexFile << ")!" << std::endl; exit(1); } size_t ca3m_offset; char* ca3m_data = ffindex_mmap_data(ca3m_data_fh, &ca3m_offset); ffindex_index_t* ca3m_index = ffindex_index_parse(ca3m_index_fh, 0); if(ca3m_index == NULL) { std::cerr << "ERROR: CA3M index (" << ca3mIndexFile << ") could not be loaded!" << std::endl; exit(1); } //prepare ffindex sequence database std::string sequenceDataFile = ffindex_sequence_db_prefix+".ffdata"; std::string sequenceIndexFile = ffindex_sequence_db_prefix+".ffindex"; FILE *sequence_data_fh = fopen(sequenceDataFile.c_str(), "r"); FILE *sequence_index_fh = fopen(sequenceIndexFile.c_str(), "r"); if (sequence_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex sequence data file! (" << sequenceDataFile << ")!" << std::endl; exit(1); } if(sequence_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex sequence index file! (" << sequenceIndexFile << ")!" << std::endl; exit(1); } size_t sequence_data_size; char* sequence_data = ffindex_mmap_data(sequence_data_fh, &sequence_data_size); ffindex_index_t* sequence_index = ffindex_index_parse(sequence_index_fh, 80000000); if(sequence_index == NULL) { std::cerr << "ERROR: Sequence index could not be loaded!" << std::endl; exit(1); } //prepare ffindex header database std::string headerDataFile = ffindex_header_db_prefix + ".ffdata"; std::string headerIndexFile = ffindex_header_db_prefix + ".ffindex"; FILE *header_data_fh = fopen(headerDataFile.c_str(), "r"); FILE *header_index_fh = fopen(headerIndexFile.c_str(), "r"); if (header_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex sequence data file! (" << headerDataFile << ")!" << std::endl; exit(1); } if (header_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex header index file! (" << headerIndexFile << ")!" 
<< std::endl; exit(1); } size_t header_data_size; char* header_data = ffindex_mmap_data(header_data_fh, &header_data_size); ffindex_index_t* header_index = ffindex_index_parse(header_index_fh, 1E8); if (header_index == NULL) { std::cerr << "ERROR: Header index could not be loaded!" << std::endl; exit(1); } //prepare input stream size_t ca3m_range_start = 0; size_t ca3m_range_end = ca3m_index->n_entries; // Foreach entry #pragma omp parallel for shared(ca3m_index, ca3m_data, a3m_data_fh, a3m_index_fh, a3m_offset) for(size_t entry_index = ca3m_range_start; entry_index < ca3m_range_end; entry_index++) { ffindex_entry_t* entry = ffindex_get_entry_by_index(ca3m_index, entry_index); if(entry == NULL) { perror(entry->name); continue; } char* data = ffindex_get_data_by_entry(ca3m_data, entry); std::stringstream* out_buffer = new std::stringstream(); compressed_a3m::extract_a3m(data, entry->length, sequence_index, sequence_data, header_index, header_data, out_buffer); std::string out_string = out_buffer->str(); #pragma omp critical { ffindex_insert_memory(a3m_data_fh, a3m_index_fh, &a3m_offset, const_cast<char*>(out_string.c_str()), out_string.size(), entry->name); } delete out_buffer; } fclose(a3m_data_fh); fclose(a3m_index_fh); ffsort_index(a3mIndexFile.c_str()); }
int main(int argc, char **argv) { bool iflag, sflag, oflag = false; std::string set_file; std::string ffindex_oa3m_db_prefix; std::string ffindex_a3m_db_prefix; int c; while ((c = getopt(argc, argv, "i:s:o:h")) != -1) { switch (c) { case 'i': iflag = 1; ffindex_a3m_db_prefix = optarg; break; case 's': sflag = 1; set_file = optarg; break; case 'o': oflag = optarg; ffindex_oa3m_db_prefix = optarg; break; case 'h': usage(); exit(0); case '?': if (optopt == 'c') fprintf(stderr, "Option -%c requires an argument.\n", optopt); else if (isprint(optopt)) fprintf(stderr, "Unknown option `-%c'.\n", optopt); else fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); return 1; default: abort(); } } if(!iflag || !sflag || !oflag) { std::cerr << "Missing input!" << std::endl; usage(); exit(1); } //prepare ffindex a3m output database std::string oa3mDataFile = ffindex_oa3m_db_prefix+".ffdata"; std::string oa3mIndexFile = ffindex_oa3m_db_prefix+".ffindex"; FILE *oa3m_data_fh = fopen(oa3mDataFile.c_str(), "w"); FILE *oa3m_index_fh = fopen(oa3mIndexFile.c_str(), "w"); if (oa3m_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex ca3m data file! (" << oa3mDataFile << ")!" << std::endl; exit(1); } if(oa3m_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex ca3m index file! (" << oa3mIndexFile << ")!" << std::endl; exit(1); } size_t oa3m_offset = 0; //prepare ffindex a3m database std::string a3mDataFile = ffindex_a3m_db_prefix+".ffdata"; std::string a3mIndexFile = ffindex_a3m_db_prefix+".ffindex"; FILE *a3m_data_fh = fopen(a3mDataFile.c_str(), "r"); FILE *a3m_index_fh = fopen(a3mIndexFile.c_str(), "r"); if (a3m_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m data file! (" << a3mDataFile << ")!" << std::endl; exit(1); } if(a3m_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m index file! (" << a3mIndexFile << ")!" 
<< std::endl; exit(1); } size_t a3m_offset; char* a3m_data = ffindex_mmap_data(a3m_data_fh, &a3m_offset); ffindex_index_t* a3m_index = ffindex_index_parse(a3m_index_fh, 0); if(a3m_index == NULL) { std::cerr << "ERROR: A3M index could not be loaded!" << std::endl; exit(1); } //prepare filter std::set<std::string> filter; std::ifstream infile(set_file.c_str()); std::string line; while (std::getline(infile, line)) { std::string item = line.substr(0, line.length()); filter.insert(item); } infile.close(); //prepare input stream size_t a3m_range_start = 0; size_t a3m_range_end = a3m_index->n_entries; // Foreach entry #pragma omp parallel for shared(a3m_index, a3m_data, oa3m_data_fh, oa3m_index_fh, oa3m_offset) for(size_t entry_index = a3m_range_start; entry_index < a3m_range_end; entry_index++) { //fprintf(stderr, "index %ld\n", entry_index); ffindex_entry_t* entry = ffindex_get_entry_by_index(a3m_index, entry_index); if(entry == NULL) { perror(entry->name); continue; } char* data = ffindex_get_data_by_entry(a3m_data, entry); std::stringstream* out_buffer = new std::stringstream(); size_t nr_sequences = 0; for(size_t index = 0; index < entry->length; index++) { //write annotation line if(data[index] == '#') { while(data[index] != '\n' && index < entry->length) { out_buffer->put(data[index++]); } out_buffer->put('\n'); } else if(data[index] == '>') { size_t start_index = index; while(index < entry->length && data[index] != '\n') { index++; } //copy line without new line std::string header = std::string(&data[start_index], index - start_index); std::string id = getNameFromHeader(header); bool consensus_flag = isConsensus(id); std::string short_id = getShortIdFromHeader(header); while(index < entry->length - 1 && data[index] != '>') { index++; } if(data[index] == '>' || data[index] == '\0') { index--; } bool passedFilter = false; if(filter.find(short_id) != filter.end()) { nr_sequences++; passedFilter = true; } if(passedFilter || consensus_flag || id.compare("ss_dssp") == 0 
|| id.compare("sa_dssp") == 0 || id.compare("ss_pred") == 0 || id.compare("ss_conf") == 0) { std::string seq = std::string(&data[start_index], index - start_index); out_buffer->write(seq.c_str(), seq.size()); out_buffer->put('\n'); } } } if(nr_sequences > 0) { std::string out_string = out_buffer->str(); #pragma omp critical { ffindex_insert_memory(oa3m_data_fh, oa3m_index_fh, &oa3m_offset, const_cast<char*>(out_string.c_str()), out_string.size(), entry->name); } } else { std::cerr << "WARNING: No sequences left for cluster " << entry->name << std::endl; } delete out_buffer; } fclose(oa3m_data_fh); }
int main(int argn, char **argv) { int by_index = 0; static struct option long_options[] = { { "byindex", no_argument, NULL, 'n' }, { NULL, 0, NULL, 0 } }; int opt; while (1) { int option_index = 0; opt = getopt_long(argn, argv, "n", long_options, &option_index); if (opt == -1) break; switch (opt) { case 'n': by_index = 1; break; default: usage(argv[0]); return EXIT_FAILURE; } } if(argn < 3) { usage(argv[0]); return EXIT_FAILURE; } char *data_filename = argv[optind++]; char *index_filename = argv[optind++]; FILE *data_file = fopen(data_filename, "r"); FILE *index_file = fopen(index_filename, "r"); if( data_file == NULL) { fferror_print(__FILE__, __LINE__, "ffindex_get", data_filename); exit(EXIT_FAILURE); } if(index_file == NULL) { fferror_print(__FILE__, __LINE__, "ffindex_get", index_filename); exit(EXIT_FAILURE); } size_t data_size; char *data = ffindex_mmap_data(data_file, &data_size); ffindex_index_t* index = ffindex_index_parse(index_file, 0); if(index == NULL) { fferror_print(__FILE__, __LINE__, "ffindex_index_parse", index_filename); exit(EXIT_FAILURE); } if(by_index) { for(int i = optind; i < argn; i++) { size_t index_n = atol(argv[i]) - 1; // offset from 0 but specify from 1 ffindex_entry_t* entry = ffindex_get_entry_by_index(index, index_n); if(entry == NULL) { errno = ENOENT; fferror_print(__FILE__, __LINE__, "ffindex_get entry index out of range", argv[i]); } else { char *filedata = ffindex_get_data_by_entry(data, entry); if(filedata == NULL) { errno = ENOENT; fferror_print(__FILE__, __LINE__, "ffindex_get entry index out of range", argv[i]); } else fwrite(filedata, entry->length - 1, 1, stdout); } } } else // by name { for(int i = optind; i < argn; i++) { char *filename = argv[i]; ffindex_entry_t* entry = ffindex_get_entry_by_name(index, filename); if(entry == NULL) { errno = ENOENT; fferror_print(__FILE__, __LINE__, "ffindex_get key not found in index", filename); } else { char *filedata = ffindex_get_data_by_entry(data, entry); if(filedata == NULL) { 
errno = ENOENT; fferror_print(__FILE__, __LINE__, "ffindex_get key not found in index", filename); } else fwrite(filedata, entry->length - 1, 1, stdout); } } /* Alternative code using (slower) ffindex_fopen */ /* FILE *file = ffindex_fopen(data, index, filename); if(file == NULL) { errno = ENOENT; fferror_print(__FILE__, __LINE__, "ffindex_fopen file not found in index", filename); } else { char line[LINE_MAX]; while(fgets(line, LINE_MAX, file) != NULL) printf("%s", line); } */ } return 0; }
int main(int argn, char **argv) { if(argn < 4) { fprintf(stderr, "USAGE: %s DATA_FILENAME INDEX_FILENAME PROGRAM [PROGRAM_ARGS]*\n" "\nDesigned and implemented by Andy Hauser <*****@*****.**>.\n", argv[0]); return -1; } char *data_filename = argv[1]; char *index_filename = argv[2]; char *program_name = argv[3]; char **program_argv = argv + 3; FILE *data_file = fopen(data_filename, "r"); FILE *index_file = fopen(index_filename, "r"); if( data_file == NULL) { fferror_print(__FILE__, __LINE__, argv[0], data_filename); exit(EXIT_FAILURE); } if(index_file == NULL) { fferror_print(__FILE__, __LINE__, argv[0], index_filename); exit(EXIT_FAILURE); } size_t data_size; char *data = ffindex_mmap_data(data_file, &data_size); ffindex_index_t* index = ffindex_index_parse(index_file, 0); if(index == NULL) { fferror_print(__FILE__, __LINE__, "ffindex_index_parse", index_filename); exit(EXIT_FAILURE); } // Ignore SIGPIPE struct sigaction handler; handler.sa_handler = SIG_IGN; sigemptyset(&handler.sa_mask); handler.sa_flags = 0; sigaction(SIGPIPE, &handler, NULL); size_t range_start = 0; size_t range_end = index->n_entries; // Foreach entry //#pragma omp parallel for for(size_t entry_index = range_start; entry_index < range_end; entry_index++) { //fprintf(stderr, "index %ld\n", entry_index); int ret = 0; ffindex_entry_t* entry = ffindex_get_entry_by_index(index, entry_index); if(entry == NULL) { perror(entry->name); continue; } int pipefd[2]; ret = pipe(pipefd); if(ret != 0) { perror(entry->name); continue; } pid_t child_pid = fork(); if(child_pid == 0) { fclose(data_file); fclose(index_file); close(pipefd[1]); // Make pipe from parent our new stdin int newfd = dup2(pipefd[0], fileno(stdin)); if(newfd < 0) { fprintf(stdout, "%d %d\n", pipefd[0], newfd); perror(entry->name); } close(pipefd[0]); // exec program with the pipe as stdin execvp(program_name, program_argv); // never reached } else if(child_pid > 0) { // Read end is for child only close(pipefd[0]); // Write file data to 
child's stdin. char *filedata = ffindex_get_data_by_entry(data, entry); ssize_t written = 0; while(written < entry->length) { int w = write(pipefd[1], filedata + written, entry->length - written); if(w < 0 && errno != EPIPE) { perror(entry->name); break; } else if(w == 0 && errno != 0) { perror(entry->name); break; } else written += w; } close(pipefd[1]); // child gets EOF waitpid(child_pid, NULL, 0); } else { perror(entry->name); exit(errno); } } return 0; }