int main(int argc, char **argv) { bool iflag, dflag, oflag, qflag; iflag = dflag = oflag = qflag = false; std::string ffindex_header_db_prefix; std::string ffindex_sequence_db_prefix; std::string ffindex_ca3m_db_prefix; std::string ffindex_a3m_db_prefix; int c; while ((c = getopt(argc, argv, "i:d:o:q:h")) != -1) { switch (c) { case 'i': iflag = 1; ffindex_ca3m_db_prefix = optarg; break; case 'd': dflag = 1; ffindex_sequence_db_prefix = optarg; break; case 'o': oflag = 1; ffindex_a3m_db_prefix = optarg; break; case 'q': qflag = 1; ffindex_header_db_prefix = optarg; break; case 'h': usage(); exit(0); case '?': if (optopt == 'c') fprintf(stderr, "Option -%c requires an argument.\n", optopt); else if (isprint(optopt)) fprintf(stderr, "Unknown option `-%c'.\n", optopt); else fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); return 1; default: abort(); } } if(!iflag || !dflag || !oflag || !qflag) { std::cerr << "Missing arguments!" << std::endl; usage(); exit(0); } //prepare ffindex a3m database std::string a3mDataFile = ffindex_a3m_db_prefix+".ffdata"; std::string a3mIndexFile = ffindex_a3m_db_prefix+".ffindex"; FILE *a3m_data_fh = fopen(a3mDataFile.c_str(), "w"); FILE *a3m_index_fh = fopen(a3mIndexFile.c_str(), "w"); if (a3m_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m data file! (" << a3mDataFile << ")!" << std::endl; exit(1); } if(a3m_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m index file! (" << a3mIndexFile << ")!" << std::endl; exit(1); } size_t a3m_offset = 0; //prepare ffindex ca3m database std::string ca3mDataFile = ffindex_ca3m_db_prefix+".ffdata"; std::string ca3mIndexFile = ffindex_ca3m_db_prefix+".ffindex"; FILE *ca3m_data_fh = fopen(ca3mDataFile.c_str(), "r"); FILE *ca3m_index_fh = fopen(ca3mIndexFile.c_str(), "r"); if (ca3m_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m data file! (" << ca3mDataFile << ")!" << std::endl; exit(1); } if(ca3m_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex a3m index file! (" << ca3mIndexFile << ")!" << std::endl; exit(1); } size_t ca3m_offset; char* ca3m_data = ffindex_mmap_data(ca3m_data_fh, &ca3m_offset); ffindex_index_t* ca3m_index = ffindex_index_parse(ca3m_index_fh, 0); if(ca3m_index == NULL) { std::cerr << "ERROR: CA3M index (" << ca3mIndexFile << ") could not be loaded!" << std::endl; exit(1); } //prepare ffindex sequence database std::string sequenceDataFile = ffindex_sequence_db_prefix+".ffdata"; std::string sequenceIndexFile = ffindex_sequence_db_prefix+".ffindex"; FILE *sequence_data_fh = fopen(sequenceDataFile.c_str(), "r"); FILE *sequence_index_fh = fopen(sequenceIndexFile.c_str(), "r"); if (sequence_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex sequence data file! (" << sequenceDataFile << ")!" << std::endl; exit(1); } if(sequence_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex sequence index file! (" << sequenceIndexFile << ")!" << std::endl; exit(1); } size_t sequence_data_size; char* sequence_data = ffindex_mmap_data(sequence_data_fh, &sequence_data_size); ffindex_index_t* sequence_index = ffindex_index_parse(sequence_index_fh, 80000000); if(sequence_index == NULL) { std::cerr << "ERROR: Sequence index could not be loaded!" << std::endl; exit(1); } //prepare ffindex header database std::string headerDataFile = ffindex_header_db_prefix + ".ffdata"; std::string headerIndexFile = ffindex_header_db_prefix + ".ffindex"; FILE *header_data_fh = fopen(headerDataFile.c_str(), "r"); FILE *header_index_fh = fopen(headerIndexFile.c_str(), "r"); if (header_data_fh == NULL) { std::cerr << "ERROR: Could not open ffindex sequence data file! (" << headerDataFile << ")!" << std::endl; exit(1); } if (header_index_fh == NULL) { std::cerr << "ERROR: Could not open ffindex header index file! (" << headerIndexFile << ")!" << std::endl; exit(1); } size_t header_data_size; char* header_data = ffindex_mmap_data(header_data_fh, &header_data_size); ffindex_index_t* header_index = ffindex_index_parse(header_index_fh, 1E8); if (header_index == NULL) { std::cerr << "ERROR: Header index could not be loaded!" << std::endl; exit(1); } //prepare input stream size_t ca3m_range_start = 0; size_t ca3m_range_end = ca3m_index->n_entries; // Foreach entry #pragma omp parallel for shared(ca3m_index, ca3m_data, a3m_data_fh, a3m_index_fh, a3m_offset) for(size_t entry_index = ca3m_range_start; entry_index < ca3m_range_end; entry_index++) { ffindex_entry_t* entry = ffindex_get_entry_by_index(ca3m_index, entry_index); if(entry == NULL) { perror(entry->name); continue; } char* data = ffindex_get_data_by_entry(ca3m_data, entry); std::stringstream* out_buffer = new std::stringstream(); compressed_a3m::extract_a3m(data, entry->length, sequence_index, sequence_data, header_index, header_data, out_buffer); std::string out_string = out_buffer->str(); #pragma omp critical { ffindex_insert_memory(a3m_data_fh, a3m_index_fh, &a3m_offset, const_cast<char*>(out_string.c_str()), out_string.size(), entry->name); } delete out_buffer; } fclose(a3m_data_fh); fclose(a3m_index_fh); ffsort_index(a3mIndexFile.c_str()); }
int main(int argn, char **argv) { int sort = 0, version = 0; int opt, err = EXIT_SUCCESS; while ((opt = getopt(argn, argv, "sv")) != -1) { switch (opt) { case 's': sort = 1; break; case 'v': version = 1; break; default: usage(argv[0]); return EXIT_FAILURE; } } if(version == 1) { /* Don't you dare running it on a platform where byte != 8 bits */ printf("%s version %.2f, off_t = %zd bits\n", argv[0], FFINDEX_VERSION, sizeof(off_t) * 8); return EXIT_SUCCESS; } if(argn - optind < 3) { usage(argv[0]); return EXIT_FAILURE; } char *data_header_filename = argv[optind++]; char *index_header_filename = argv[optind++]; char *data_sequence_filename = argv[optind++]; char *index_sequence_filename = argv[optind++]; char *fasta_filename = argv[optind++]; printf("data header file: %s\n", data_header_filename); printf("index header file: %s\n", index_header_filename); printf("data sequence file: %s\n", data_sequence_filename); printf("index sequence file: %s\n", index_sequence_filename); printf("fasta file: %s\n", fasta_filename); FILE *data_header_file, *index_header_file, *data_sequence_file, *index_sequence_file, *fasta_file; size_t header_offset = 0; size_t sequence_offset = 0; struct stat st; // open header ffindex if(stat(data_header_filename, &st) == 0) { errno = EEXIST; perror(data_header_filename); return EXIT_FAILURE; } data_header_file = fopen(data_header_filename, "w"); if( data_header_file == NULL) { perror(data_header_filename); return EXIT_FAILURE; } if(stat(index_header_filename, &st) == 0) { errno = EEXIST; perror(index_header_filename); return EXIT_FAILURE; } index_header_file = fopen(index_header_filename, "w+"); if(index_header_file == NULL) { perror(index_header_filename); return EXIT_FAILURE; } //open sequence ffindex if(stat(data_sequence_filename, &st) == 0) { errno = EEXIST; perror(data_sequence_filename); return EXIT_FAILURE; } data_sequence_file = fopen(data_sequence_filename, "w"); if( data_sequence_file == NULL) { perror(data_sequence_filename); return EXIT_FAILURE; } if(stat(index_sequence_filename, &st) == 0) { errno = EEXIST; perror(index_sequence_filename); return EXIT_FAILURE; } index_sequence_file = fopen(index_sequence_filename, "w+"); if(index_sequence_file == NULL) { perror(index_sequence_filename); return EXIT_FAILURE; } fasta_file = fopen(fasta_filename, "r"); if(fasta_file == NULL) { perror(fasta_filename); return EXIT_FAILURE; } size_t fasta_size; char *fasta_data = ffindex_mmap_data(fasta_file, &fasta_size); // size_t from_length = 0; char name[FFINDEX_MAX_ENTRY_NAME_LENTH]; int seq_id = 1; size_t seq_id_length = 0; size_t count_ws = 0; char header[MAX_ENTRY_LENGTH]; header[0] = '>'; size_t header_length = 1; char is_header = 1; char sequence[MAX_ENTRY_LENGTH]; size_t sequence_length = 0; for(size_t fasta_offset = 1; fasta_offset < fasta_size; fasta_offset++) // position after first ">" { seq_id_length = 0; count_ws = 0; is_header = 1; header_length = 1; sequence_length = 0; while(fasta_offset < fasta_size && !(*(fasta_data + fasta_offset) == '>' && *(fasta_data + fasta_offset - 1) == '\n')) { char input = *(fasta_data + fasta_offset); //get fasta name if(isspace(input)) { count_ws++; name[seq_id_length] = '\0'; } else if(count_ws == 0) { name[seq_id_length++] = *(fasta_data + fasta_offset); } if(input == '\n') { is_header = 0; header[header_length] = '\0'; sequence[sequence_length] = '\0'; } else { if(is_header == 1) { header[header_length++] = input; } else { sequence[sequence_length++] = input; } } fasta_offset++; } if(seq_id_length == 0) { sprintf(name, "%d", seq_id); } seq_id++; get_short_id(name, '|', 2); ffindex_insert_memory(data_header_file, index_header_file, &header_offset, header, header_length, name); ffindex_insert_memory(data_sequence_file, index_sequence_file, &sequence_offset, sequence, sequence_length, name); } fclose(data_header_file); fclose(data_sequence_file); fclose(index_header_file); fclose(index_sequence_file); /* Sort the index entries and write back */ if(sort) { ffsort_index(index_header_filename); ffsort_index(index_sequence_filename); } return err; }