Example #1
0
/*
 * Extracts every entry of a compressed a3m (ca3m) ffindex database into a
 * plain a3m ffindex database.
 *
 * Command line options (all four database prefixes are mandatory):
 *   -i <prefix>  input ca3m database   (<prefix>.ffdata / <prefix>.ffindex)
 *   -d <prefix>  sequence database     (<prefix>.ffdata / <prefix>.ffindex)
 *   -q <prefix>  header database       (<prefix>.ffdata / <prefix>.ffindex)
 *   -o <prefix>  output a3m database   (<prefix>.ffdata / <prefix>.ffindex)
 *   -h           print usage and exit
 *
 * Each ca3m entry is expanded with compressed_a3m::extract_a3m() and the
 * result is appended to the output database; ffsort_index() is run on the
 * output index once all entries have been written.
 *
 * Returns 0 on success and a non-zero status on any usage or I/O error.
 */
int main(int argc, char **argv) {
  bool iflag = false;
  bool dflag = false;
  bool oflag = false;
  bool qflag = false;

  std::string ffindex_header_db_prefix;
  std::string ffindex_sequence_db_prefix;
  std::string ffindex_ca3m_db_prefix;
  std::string ffindex_a3m_db_prefix;

  int c;
  while ((c = getopt(argc, argv, "i:d:o:q:h")) != -1) {
    switch (c) {
      case 'i':
        iflag = true;
        ffindex_ca3m_db_prefix = optarg;
        break;
      case 'd':
        dflag = true;
        ffindex_sequence_db_prefix = optarg;
        break;
      case 'o':
        oflag = true;
        ffindex_a3m_db_prefix = optarg;
        break;
      case 'q':
        qflag = true;
        ffindex_header_db_prefix = optarg;
        break;
      case 'h':
        usage();
        return 0;
      case '?':
        // getopt() reports both unknown options and known options that lack
        // their argument as '?'; optopt holds the offending option character.
        if (optopt == 'i' || optopt == 'd' || optopt == 'o' || optopt == 'q')
          fprintf(stderr, "Option -%c requires an argument.\n", optopt);
        else if (isprint(optopt))
          fprintf(stderr, "Unknown option `-%c'.\n", optopt);
        else
          fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
        return 1;
      default:
        abort();
    }
  }

  if (!iflag || !dflag || !oflag || !qflag) {
    std::cerr << "Missing arguments!" << std::endl;
    usage();
    // A usage error is a failure: report it through the exit status too.
    return 1;
  }

  //prepare ffindex a3m database (output)
  std::string a3mDataFile = ffindex_a3m_db_prefix + ".ffdata";
  std::string a3mIndexFile = ffindex_a3m_db_prefix + ".ffindex";

  FILE *a3m_data_fh  = fopen(a3mDataFile.c_str(), "w");
  FILE *a3m_index_fh = fopen(a3mIndexFile.c_str(), "w");

  if (a3m_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex a3m data file! (" << a3mDataFile << ")!" << std::endl;
    exit(1);
  }

  if (a3m_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex a3m index file! (" << a3mIndexFile << ")!" << std::endl;
    exit(1);
  }

  // Running write offset into the output data file; updated by
  // ffindex_insert_memory() through the pointer passed to it.
  size_t a3m_offset = 0;

  //prepare ffindex ca3m database (input)
  std::string ca3mDataFile = ffindex_ca3m_db_prefix + ".ffdata";
  std::string ca3mIndexFile = ffindex_ca3m_db_prefix + ".ffindex";

  FILE *ca3m_data_fh  = fopen(ca3mDataFile.c_str(), "r");
  FILE *ca3m_index_fh = fopen(ca3mIndexFile.c_str(), "r");

  if (ca3m_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex ca3m data file! (" << ca3mDataFile << ")!" << std::endl;
    exit(1);
  }

  if (ca3m_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex ca3m index file! (" << ca3mIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t ca3m_offset;
  char* ca3m_data = ffindex_mmap_data(ca3m_data_fh, &ca3m_offset);
  ffindex_index_t* ca3m_index = ffindex_index_parse(ca3m_index_fh, 0);

  if (ca3m_index == NULL) {
    std::cerr << "ERROR: CA3M index (" << ca3mIndexFile << ") could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare ffindex sequence database
  std::string sequenceDataFile = ffindex_sequence_db_prefix + ".ffdata";
  std::string sequenceIndexFile = ffindex_sequence_db_prefix + ".ffindex";

  FILE *sequence_data_fh  = fopen(sequenceDataFile.c_str(), "r");
  FILE *sequence_index_fh = fopen(sequenceIndexFile.c_str(), "r");

  if (sequence_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex sequence data file! (" << sequenceDataFile << ")!" << std::endl;
    exit(1);
  }

  if (sequence_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex sequence index file! (" << sequenceIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t sequence_data_size;
  char* sequence_data = ffindex_mmap_data(sequence_data_fh, &sequence_data_size);
  ffindex_index_t* sequence_index = ffindex_index_parse(sequence_index_fh, 80000000);

  if (sequence_index == NULL) {
    std::cerr << "ERROR: Sequence index could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare ffindex header database
  std::string headerDataFile = ffindex_header_db_prefix + ".ffdata";
  std::string headerIndexFile = ffindex_header_db_prefix + ".ffindex";

  FILE *header_data_fh = fopen(headerDataFile.c_str(), "r");
  FILE *header_index_fh = fopen(headerIndexFile.c_str(), "r");

  if (header_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex header data file! ("
        << headerDataFile << ")!" << std::endl;
    exit(1);
  }

  if (header_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex header index file! ("
        << headerIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t header_data_size;
  char* header_data = ffindex_mmap_data(header_data_fh,
      &header_data_size);
  ffindex_index_t* header_index = ffindex_index_parse(header_index_fh, 1E8);

  if (header_index == NULL) {
    std::cerr << "ERROR: Header index could not be loaded!" << std::endl;
    exit(1);
  }

  //range of ca3m entries to process (currently the whole index)
  size_t ca3m_range_start = 0;
  size_t ca3m_range_end = ca3m_index->n_entries;

  // Foreach entry: extraction runs in parallel, only the append to the shared
  // output files (and the shared offset) is serialized.
  #pragma omp parallel for shared(ca3m_index, ca3m_data, a3m_data_fh, a3m_index_fh, a3m_offset)
  for (size_t entry_index = ca3m_range_start; entry_index < ca3m_range_end; entry_index++)
  {
    ffindex_entry_t* entry = ffindex_get_entry_by_index(ca3m_index, entry_index);
    if (entry == NULL) {
      // The entry pointer is NULL here, so it must not be dereferenced for
      // the message; identify the entry by its position instead.
      fprintf(stderr, "ERROR: Could not get entry %zu of CA3M index!\n", entry_index);
      continue;
    }

    char* data = ffindex_get_data_by_entry(ca3m_data, entry);

    // Per-iteration buffer with automatic storage; no manual delete needed.
    std::stringstream out_buffer;
    compressed_a3m::extract_a3m(data, entry->length, sequence_index, sequence_data, header_index, header_data, &out_buffer);

    std::string out_string = out_buffer.str();

    #pragma omp critical
    {
      ffindex_insert_memory(a3m_data_fh, a3m_index_fh, &a3m_offset, const_cast<char*>(out_string.c_str()), out_string.size(), entry->name);
    }
  }

  fclose(a3m_data_fh);
  fclose(a3m_index_fh);

  ffsort_index(a3mIndexFile.c_str());

  return 0;
}
/*
 * Splits a FASTA file into two ffindex databases: one holding the header
 * line of every record and one holding its sequence (line breaks removed).
 *
 * Usage: <prog> [-s] [-v] DATA_HEADER INDEX_HEADER DATA_SEQUENCE INDEX_SEQUENCE FASTA
 *   -s  sort both index files after writing
 *   -v  print version information and exit
 *
 * None of the four output files may exist beforehand. The entry name is the
 * text between '>' and the first whitespace, shortened by get_short_id();
 * records without such text are named after their 1-based position in the
 * file. Names longer than the name buffer are truncated. Records whose header
 * or sequence does not fit into MAX_ENTRY_LENGTH are reported on stderr,
 * skipped, and make the program exit with EXIT_FAILURE.
 *
 * Returns EXIT_SUCCESS if every record was written, EXIT_FAILURE otherwise.
 */
int main(int argn, char **argv)
{
  int sort = 0, version = 0;
  int opt, err = EXIT_SUCCESS;
  while ((opt = getopt(argn, argv, "sv")) != -1)
  {
    switch (opt)
    {
      case 's':
        sort = 1;
        break;
      case 'v':
        version = 1;
        break;
      default:
        usage(argv[0]);
        return EXIT_FAILURE;
    }
  }

  if(version == 1)
  {
    /* Don't you dare running it on a platform where byte != 8 bits */
    /* sizeof yields size_t, which is unsigned: %zu, not %zd */
    printf("%s version %.2f, off_t = %zu bits\n", argv[0], FFINDEX_VERSION, sizeof(off_t) * 8);
    return EXIT_SUCCESS;
  }

  /* Five positional arguments are consumed below, so all five must be there. */
  if(argn - optind < 5)
  {
    usage(argv[0]);
    return EXIT_FAILURE;
  }


  char *data_header_filename  = argv[optind++];
  char *index_header_filename = argv[optind++];
  char *data_sequence_filename = argv[optind++];
  char *index_sequence_filename = argv[optind++];

  char *fasta_filename = argv[optind++];

  printf("data header file: %s\n", data_header_filename);
  printf("index header file: %s\n", index_header_filename);
  printf("data sequence file: %s\n", data_sequence_filename);
  printf("index sequence file: %s\n", index_sequence_filename);
  printf("fasta file: %s\n", fasta_filename);


  FILE *data_header_file, *index_header_file, *data_sequence_file, *index_sequence_file, *fasta_file;
  size_t header_offset = 0;
  size_t sequence_offset = 0;

  struct stat st;

  // open header ffindex (refuse to overwrite existing files)
  if(stat(data_header_filename, &st) == 0) { errno = EEXIST; perror(data_header_filename); return EXIT_FAILURE; }
  data_header_file  = fopen(data_header_filename, "w");
  if( data_header_file == NULL) { perror(data_header_filename); return EXIT_FAILURE; }

  if(stat(index_header_filename, &st) == 0) { errno = EEXIST; perror(index_header_filename); return EXIT_FAILURE; }
  index_header_file = fopen(index_header_filename, "w+");
  if(index_header_file == NULL) { perror(index_header_filename); return EXIT_FAILURE; }

  //open sequence ffindex (refuse to overwrite existing files)
  if(stat(data_sequence_filename, &st) == 0) { errno = EEXIST; perror(data_sequence_filename); return EXIT_FAILURE; }
  data_sequence_file  = fopen(data_sequence_filename, "w");
  if( data_sequence_file == NULL) { perror(data_sequence_filename); return EXIT_FAILURE; }

  if(stat(index_sequence_filename, &st) == 0) { errno = EEXIST; perror(index_sequence_filename); return EXIT_FAILURE; }
  index_sequence_file = fopen(index_sequence_filename, "w+");
  if(index_sequence_file == NULL) { perror(index_sequence_filename); return EXIT_FAILURE; }

  fasta_file = fopen(fasta_filename, "r");
  if(fasta_file == NULL) { perror(fasta_filename); return EXIT_FAILURE; }

  size_t fasta_size;
  char *fasta_data = ffindex_mmap_data(fasta_file, &fasta_size);

  char name[FFINDEX_MAX_ENTRY_NAME_LENTH];
  int seq_id = 1;

  /* The leading '>' is written once; every record restarts at index 1. */
  char header[MAX_ENTRY_LENGTH];
  header[0] = '>';

  char sequence[MAX_ENTRY_LENGTH];

  for(size_t fasta_offset = 1; fasta_offset < fasta_size; fasta_offset++) // position after first ">"
  {
    size_t seq_id_length = 0;
    size_t count_ws = 0;

    bool is_header = true;
    size_t header_length = 1;

    size_t sequence_length = 0;

    /* Set when the header or the sequence does not fit into its buffer. */
    bool too_long = false;

    /* A record ends at the next '>' that directly follows a line break. */
    while(fasta_offset < fasta_size && !(fasta_data[fasta_offset] == '>' && fasta_data[fasta_offset - 1] == '\n'))
    {
      const char input = fasta_data[fasta_offset];

      //get fasta name: everything up to the first whitespace character
      /* isspace() requires a value representable as unsigned char */
      if(isspace(static_cast<unsigned char>(input)))
      {
        count_ws++;
      }
      else if(count_ws == 0 && seq_id_length < sizeof(name) - 1)
      {
        /* characters beyond the name buffer are dropped (one byte kept for NUL) */
        name[seq_id_length++] = input;
      }

      if(input == '\n') {
        /* the first line break ends the header; later ones are just dropped */
        is_header = false;
      }
      else if(is_header) {
        if(header_length < sizeof(header) - 1) {
          header[header_length++] = input;
        }
        else {
          too_long = true;
        }
      }
      else {
        if(sequence_length < sizeof(sequence) - 1) {
          sequence[sequence_length++] = input;
        }
        else {
          too_long = true;
        }
      }

      fasta_offset++;
    }

    /* All three lengths are capped at size - 1, so these stay in bounds. */
    name[seq_id_length] = '\0';
    header[header_length] = '\0';
    sequence[sequence_length] = '\0';

    if(seq_id_length == 0) {
      snprintf(name, sizeof(name), "%d", seq_id);
    }
    seq_id++;

    get_short_id(name, '|', 2);

    if(too_long) {
      /* Writing a silently truncated record would corrupt the database. */
      fprintf(stderr, "ERROR: entry '%s' exceeds the maximum entry length of %zu characters and was skipped\n",
          name, sizeof(sequence) - 1);
      err = EXIT_FAILURE;
      continue;
    }

    ffindex_insert_memory(data_header_file, index_header_file, &header_offset, header, header_length, name);
    ffindex_insert_memory(data_sequence_file, index_sequence_file, &sequence_offset, sequence, sequence_length, name);
  }
  fclose(fasta_file);

  fclose(data_header_file);
  fclose(data_sequence_file);

  fclose(index_header_file);
  fclose(index_sequence_file);

  /* Sort the index entries and write back */
  if(sort) {
    ffsort_index(index_header_filename);
    ffsort_index(index_sequence_filename);
  }

  return err;
}