Esempio n. 1
0
/**
 * Parse a memory-mapped FFindex index file into a newly allocated index.
 *
 * Each line of the index is expected to look like "NAME\tOFFSET\tLENGTH\n".
 *
 * @param index_file       open index file; it is mapped via ffindex_mmap_data()
 *                         and stored in the returned index.
 * @param num_max_entries  capacity of the entry array; 0 selects
 *                         FFINDEX_MAX_INDEX_ENTRIES_DEFAULT.
 * @return the parsed index, or NULL if allocation fails, the file holds more
 *         entries than num_max_entries, or no entry could be parsed.
 */
ffindex_index_t* ffindex_index_parse(FILE *index_file, size_t num_max_entries)
{
  if(num_max_entries == 0)
    num_max_entries = FFINDEX_MAX_INDEX_ENTRIES_DEFAULT;
  size_t nbytes = sizeof(ffindex_index_t) + (sizeof(ffindex_entry_t) * num_max_entries);
  ffindex_index_t *index = (ffindex_index_t *)malloc(nbytes);
  /* Check the allocation BEFORE touching any member of the index. */
  if(index == NULL)
  {
    fferror_print(__FILE__, __LINE__, __func__, "malloc failed");
    return NULL;
  }

  index->num_max_entries = num_max_entries;
  index->file = index_file;
  index->index_data = ffindex_mmap_data(index_file, &(index->index_data_size));
  index->type = SORTED_ARRAY; /* Assume a sorted file for now */
  size_t i = 0;
  char* d = index->index_data;
  char* data_end = index->index_data + index->index_data_size;
  char* end;
  /* Faster than scanf per line */
  for(i = 0; d < data_end; i++)
  {
    /* Never write past the entry array that was allocated above. */
    if(i >= num_max_entries)
    {
      fferror_print(__FILE__, __LINE__, __func__, "index has more entries than num_max_entries");
      free(index);
      return NULL;
    }

    /* Copy the name up to the first tab, staying inside the mapped data. */
    int p;
    for(p = 0; d < data_end && *d != '\t'; d++)
      index->entries[i].name[p++] = *d;
    index->entries[i].name[p] = '\0';

    /* A final line without a tab is truncated: ignore it instead of
     * letting strtol() read beyond the mapping. */
    if(d >= data_end)
      break;

    index->entries[i].offset = strtol(d, &end, 10);
    d = end;
    index->entries[i].length  = strtol(d, &end, 10);
    d = end + 1; /* +1 for newline */
  }

  index->n_entries = i;

  if(index->n_entries == 0)
  {
    /* Nothing parsed: release the allocation rather than leaking it. */
    free(index);
    return NULL;
  }

  return index;
}
Esempio n. 2
0
/**
 * Command-line entry point: compress one A3M alignment against an ffindex
 * sequence database.
 *
 * Options:
 *   -i FILE    input A3M, or the literal "stdin"
 *   -d PREFIX  prefix of the sequence database (PREFIX.ffdata / PREFIX.ffindex)
 *   -o FILE    output file, or the literal "stdout"
 *   -h         print usage and exit
 *
 * @return 0 on success, 1 on any error.
 */
int main(int argc, char **argv) {
  // Every flag must start out false; previously only oflag was initialised,
  // so the "missing option" check below read indeterminate values.
  bool iflag = false, dflag = false, oflag = false;

  std::string ffindex_sequence_db_prefix;
  std::string output;
  std::string input;

  int c;
  while ((c = getopt(argc, argv, "i:d:o:h")) != -1) {
    switch (c) {
      case 'i':
        iflag = true;
        input = optarg;
        break;
      case 'd':
        dflag = true;
        ffindex_sequence_db_prefix = optarg;
        break;
      case 'o':
        oflag = true;
        output = optarg;
        break;
      case 'h':
        usage();
        exit(0);
      case '?':
        // getopt reports a missing argument by returning '?' with optopt set
        // to the offending option; -i, -d and -o are the ones taking a value.
        if (optopt == 'i' || optopt == 'd' || optopt == 'o')
          fprintf(stderr, "Option -%c requires an argument.\n", optopt);
        else if (isprint(optopt))
          fprintf(stderr, "Unknown option `-%c'.\n", optopt);
        else
          fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
        return 1;
      default:
        abort();
    }
  }

  if(!iflag || !dflag || !oflag) {
    usage();
    // A usage error is a failure, so report a non-zero status.
    exit(1);
  }

  //prepare ffindex_database
  std::string sequenceDataFile = ffindex_sequence_db_prefix+".ffdata";
  std::string sequenceIndexFile = ffindex_sequence_db_prefix+".ffindex";

  FILE *sequence_data_fh  = fopen(sequenceDataFile.c_str(), "r");
  FILE *sequence_index_fh = fopen(sequenceIndexFile.c_str(), "r");

  if (sequence_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex sequence data file! (" << sequenceDataFile << ")!" << std::endl;
    exit(1);
  }

  if(sequence_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex sequence index file! (" << sequenceIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t sequence_data_size;
  char* sequence_data = ffindex_mmap_data(sequence_data_fh, &sequence_data_size);
  ffindex_index_t* sequence_index = ffindex_index_parse(sequence_index_fh, 80000000);

  if(sequence_index == NULL) {
    std::cerr << "ERROR: Sequence index could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare input stream (the file stream lives on the stack, so nothing leaks)
  std::ifstream file_in;
  std::istream* in = &std::cin;
  if (input.compare("stdin") != 0) {
    file_in.open(input.c_str(), std::ios::binary | std::ios::in);
    if (!file_in) {
      std::cerr << "ERROR: Could not open input file! (" << input << ")!" << std::endl;
      return 1;
    }
    in = &file_in;
  }

  std::stringstream out_buffer;
  int ret = compressed_a3m::compress_a3m(in, sequence_index, sequence_data, &out_buffer);

  // compress_a3m signals success with a non-zero return value.
  if(ret) {
    //prepare output
    if (output.compare("stdout") != 0) {
      std::ofstream out(output.c_str(), std::ios::binary | std::ios::out);
      if (!out) {
        std::cerr << "ERROR: Could not open output file! (" << output << ")!" << std::endl;
        return 1;
      }
      out << out_buffer.str();
      out.close();
    }
    else {
      std::cout << out_buffer.str();
    }
    return 0;
  }
  else {
    std::cerr << "ERROR: Could not compress A3M! ("<< input << ")" << std::endl;
    return 1;
  }
}
Esempio n. 3
0
/*
 * MPI driver: run PROGRAM once per entry of an FFindex database, splitting the
 * entries across the MPI ranks.
 *
 * Usage: -d DATA_FILENAME_OUT -i INDEX_FILENAME_OUT DATA_FILENAME INDEX_FILENAME -- PROGRAM [PROGRAM_ARGS]*
 *
 * When both -d and -i are given, every rank writes its results to
 * "<name>.<rank>" files, and rank 0 merges them with ffindex_build after all
 * ranks have reached the barrier.
 *
 * Returns EXIT_SUCCESS on success, -1 on a usage error, and EXIT_FAILURE when
 * an index entry cannot be fetched.
 */
int main(int argn, char **argv)
{
  int mpi_error,
      mpi_rank,
      mpi_num_procs;

  mpi_error = MPI_Init(&argn, &argv);
  mpi_error = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  mpi_error = MPI_Comm_size(MPI_COMM_WORLD, &mpi_num_procs);

  int opt;
  char *data_filename_out  = NULL,
       *index_filename_out = NULL;

  while ((opt = getopt(argn, argv, "d:i:")) != -1)
  {
    switch (opt)
    {
      case 'd':
        data_filename_out = optarg;
        break;
      case 'i':
        index_filename_out = optarg;
        break;
    }
  }

  if(argn - optind < 3)
  {
    /* Report how many positional arguments were actually given. */
    fprintf(stderr, "Not enough arguments %d.\n", argn - optind);
    fprintf(stderr, "USAGE: %s -d DATA_FILENAME_OUT -i INDEX_FILENAME_OUT DATA_FILENAME INDEX_FILENAME -- PROGRAM [PROGRAM_ARGS]*\n"
                    "\nDesigned and implemented by Andy Hauser <*****@*****.**>.\n",
                    basename(argv[0]));
    return -1;
  }
  read_buffer = malloc(400 * 1024 * 1024);
  if(read_buffer == NULL) { fferror_print(__FILE__, __LINE__, argv[0], "malloc failed");  exit(EXIT_FAILURE); }

  char *data_filename  = argv[optind++];
  char *index_filename = argv[optind++];
  char *program_name   = argv[optind];
  char **program_argv = argv + optind;

  FILE *data_file  = fopen(data_filename,  "r");
  FILE *index_file = fopen(index_filename, "r");

  if( data_file == NULL) { fferror_print(__FILE__, __LINE__, argv[0], data_filename);  exit(EXIT_FAILURE); }
  if(index_file == NULL) { fferror_print(__FILE__, __LINE__, argv[0], index_filename);  exit(EXIT_FAILURE); }

  FILE *data_file_out = NULL, *index_file_out = NULL;
  // Setup one output FFindex for each MPI process
  if(data_filename_out != NULL && index_filename_out != NULL)
  {
    char* data_filename_out_rank  = malloc(FILENAME_MAX);
    char* index_filename_out_rank = malloc(FILENAME_MAX);
    if(data_filename_out_rank == NULL || index_filename_out_rank == NULL)
    { fferror_print(__FILE__, __LINE__, argv[0], "malloc failed");  exit(EXIT_FAILURE); }

    snprintf( data_filename_out_rank, FILENAME_MAX, "%s.%d", data_filename_out,  mpi_rank);
    snprintf(index_filename_out_rank, FILENAME_MAX, "%s.%d", index_filename_out, mpi_rank);
    data_file_out  = fopen(data_filename_out_rank,  "w+");
    index_file_out = fopen(index_filename_out_rank, "w+");

    /* Name the per-rank file that actually failed to open. */
    if( data_file_out == NULL) { fferror_print(__FILE__, __LINE__, argv[0], data_filename_out_rank);  exit(EXIT_FAILURE); }
    if(index_file_out == NULL) { fferror_print(__FILE__, __LINE__, argv[0], index_filename_out_rank);  exit(EXIT_FAILURE); }

    /* The names are only needed to open the files. */
    free(data_filename_out_rank);
    free(index_filename_out_rank);
  }

  int capture_stdout = (data_file_out != NULL);

  size_t data_size;
  char *data = ffindex_mmap_data(data_file, &data_size);

  ffindex_index_t* index = ffindex_index_parse(index_file, 0);
  if(index == NULL)
  {
    fferror_print(__FILE__, __LINE__, "ffindex_index_parse", index_filename);
    MPI_Finalize();
    exit(EXIT_FAILURE);
  }
  
  // Ignore SIGPIPE
  struct sigaction handler;
  handler.sa_handler = SIG_IGN;
  sigemptyset(&handler.sa_mask);
  handler.sa_flags = 0;
  sigaction(SIGPIPE, &handler, NULL);

  size_t batch_size, range_start, range_end;

  /* Each rank handles one contiguous batch; the remainder is handled below. */
  if(index->n_entries >= mpi_num_procs)
    batch_size = index->n_entries / mpi_num_procs;
  else
    batch_size = 0;
  range_start = mpi_rank * batch_size;
  range_end = range_start + batch_size;


  size_t offset = 0;
  // Foreach entry
  if(batch_size > 0)
    for(size_t entry_index = range_start; entry_index < range_end; entry_index++)
    {
      ffindex_entry_t* entry = ffindex_get_entry_by_index(index, entry_index);
      if(entry == NULL)
      {
        /* entry is NULL here, so it must not be dereferenced for the message. */
        fprintf(stderr, "%s: could not get index entry %zu\n", argv[0], entry_index);
        return EXIT_FAILURE;
      }
      int error = ffindex_apply_by_entry(data, index, entry, program_name, program_argv, data_file_out, index_file_out, &offset);
      if(error != 0)
        { perror(entry->name); break; }
    }

  /* Entries that do not divide evenly: rank r takes the r-th left-over entry. */
  ssize_t left_over = index->n_entries - (batch_size * mpi_num_procs);
  if(mpi_rank < left_over)
  {
    size_t left_over_entry_index = (batch_size * mpi_num_procs) + mpi_rank;
    ffindex_entry_t* entry = ffindex_get_entry_by_index(index, left_over_entry_index);
    if(entry == NULL)
    {
      fprintf(stderr, "%s: could not get index entry %zu\n", argv[0], left_over_entry_index);
      return EXIT_FAILURE;
    }
    //fprintf(stderr, "handling left over: %ld\n", left_over_entry_index);
    int error = ffindex_apply_by_entry(data, index, entry, program_name, program_argv, data_file_out, index_file_out, &offset);
    if(error != 0)
      perror(entry->name);
  }

  if(capture_stdout)
    fclose(data_file_out);
  if(index_file_out != NULL)
    fclose(index_file_out);

  MPI_Barrier(MPI_COMM_WORLD);


  // merge FFindexes in master (only if per-rank outputs were actually written)
  if(data_filename_out != NULL && index_filename_out != NULL && mpi_rank == 0)
  {
    const size_t merge_command_size = (size_t)FILENAME_MAX * 5;
    char* merge_command  = malloc(merge_command_size);
    if(merge_command == NULL) { fferror_print(__FILE__, __LINE__, argv[0], "malloc failed");  MPI_Finalize(); exit(EXIT_FAILURE); }

    for(int i = 0; i < mpi_num_procs; i++)
    {
      /* Use the full buffer size; the command holds four file names. */
      int written = snprintf( merge_command, merge_command_size, "ffindex_build -as %s %s -d %s.%d -i %s.%d",
                              data_filename_out, index_filename_out, data_filename_out, i, index_filename_out, i);
      if(written < 0 || (size_t)written >= merge_command_size)
      {
        fprintf(stderr, "%s: merge command for rank %d does not fit into the buffer\n", argv[0], i);
        continue;
      }
      //puts(merge_command);
      /* NOTE(review): file names are passed to the shell unquoted. */
      if(system(merge_command) != 0)
        fprintf(stderr, "%s: merging output of rank %d failed\n", argv[0], i);
    }
    free(merge_command);
  }

  MPI_Finalize();

  return EXIT_SUCCESS;
}
Esempio n. 4
0
/**
 * Command-line entry point: expand every entry of a compressed A3M (ca3m)
 * ffindex database into a plain A3M ffindex database.
 *
 * Options (all four are required):
 *   -i PREFIX  ca3m input database
 *   -d PREFIX  sequence database
 *   -q PREFIX  header database
 *   -o PREFIX  a3m output database
 *   -h         print usage and exit
 *
 * Entries are extracted in an OpenMP parallel loop; writes to the output
 * database are serialised in a critical section and the output index is
 * sorted at the end.
 *
 * @return 0 on success, 1 on any error.
 */
int main(int argc, char **argv) {
  bool iflag, dflag, oflag, qflag;
  iflag = dflag = oflag = qflag = false;

  std::string ffindex_header_db_prefix;
  std::string ffindex_sequence_db_prefix;
  std::string ffindex_ca3m_db_prefix;
  std::string ffindex_a3m_db_prefix;

  int c;
  while ((c = getopt(argc, argv, "i:d:o:q:h")) != -1) {
    switch (c) {
      case 'i':
        iflag = true;
        ffindex_ca3m_db_prefix = optarg;
        break;
      case 'd':
        dflag = true;
        ffindex_sequence_db_prefix = optarg;
        break;
      case 'o':
        oflag = true;
        ffindex_a3m_db_prefix = optarg;
        break;
      case 'q':
        qflag = true;
        ffindex_header_db_prefix = optarg;
        break;
      case 'h':
        usage();
        exit(0);
      case '?':
        // optopt holds the option that is missing its argument.
        if (optopt == 'i' || optopt == 'd' || optopt == 'o' || optopt == 'q')
          fprintf(stderr, "Option -%c requires an argument.\n", optopt);
        else if (isprint(optopt))
          fprintf(stderr, "Unknown option `-%c'.\n", optopt);
        else
          fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
        return 1;
      default:
        abort();
    }
  }

  if(!iflag || !dflag || !oflag || !qflag) {
    std::cerr << "Missing arguments!" << std::endl;
    usage();
    // Missing options are an error, so exit with a non-zero status.
    exit(1);
  }

  //prepare ffindex a3m database
  std::string a3mDataFile = ffindex_a3m_db_prefix+".ffdata";
  std::string a3mIndexFile = ffindex_a3m_db_prefix+".ffindex";

  FILE *a3m_data_fh  = fopen(a3mDataFile.c_str(), "w");
  FILE *a3m_index_fh = fopen(a3mIndexFile.c_str(), "w");

  if (a3m_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex a3m data file! (" << a3mDataFile << ")!" << std::endl;
    exit(1);
  }

  if(a3m_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex a3m index file! (" << a3mIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t a3m_offset = 0;

  //prepare ffindex ca3m database
  std::string ca3mDataFile = ffindex_ca3m_db_prefix+".ffdata";
  std::string ca3mIndexFile = ffindex_ca3m_db_prefix+".ffindex";

  FILE *ca3m_data_fh  = fopen(ca3mDataFile.c_str(), "r");
  FILE *ca3m_index_fh = fopen(ca3mIndexFile.c_str(), "r");

  if (ca3m_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex ca3m data file! (" << ca3mDataFile << ")!" << std::endl;
    exit(1);
  }

  if(ca3m_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex ca3m index file! (" << ca3mIndexFile << ")!" << std::endl;
    exit(1);
  }

  // Receives the size of the mapped ca3m data file.
  size_t ca3m_offset;
  char* ca3m_data = ffindex_mmap_data(ca3m_data_fh, &ca3m_offset);
  ffindex_index_t* ca3m_index = ffindex_index_parse(ca3m_index_fh, 0);

  if(ca3m_index == NULL) {
    std::cerr << "ERROR: CA3M index (" << ca3mIndexFile << ") could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare ffindex sequence database
  std::string sequenceDataFile = ffindex_sequence_db_prefix+".ffdata";
  std::string sequenceIndexFile = ffindex_sequence_db_prefix+".ffindex";

  FILE *sequence_data_fh  = fopen(sequenceDataFile.c_str(), "r");
  FILE *sequence_index_fh = fopen(sequenceIndexFile.c_str(), "r");

  if (sequence_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex sequence data file! (" << sequenceDataFile << ")!" << std::endl;
    exit(1);
  }

  if(sequence_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex sequence index file! (" << sequenceIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t sequence_data_size;
  char* sequence_data = ffindex_mmap_data(sequence_data_fh, &sequence_data_size);
  ffindex_index_t* sequence_index = ffindex_index_parse(sequence_index_fh, 80000000);

  if(sequence_index == NULL) {
    std::cerr << "ERROR: Sequence index could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare ffindex header database
  std::string headerDataFile = ffindex_header_db_prefix + ".ffdata";
  std::string headerIndexFile = ffindex_header_db_prefix + ".ffindex";

  FILE *header_data_fh = fopen(headerDataFile.c_str(), "r");
  FILE *header_index_fh = fopen(headerIndexFile.c_str(), "r");

  if (header_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex header data file! ("
        << headerDataFile << ")!" << std::endl;
    exit(1);
  }

  if (header_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex header index file! ("
        << headerIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t header_data_size;
  char* header_data = ffindex_mmap_data(header_data_fh,
      &header_data_size);
  ffindex_index_t* header_index = ffindex_index_parse(header_index_fh, 1E8);

  if (header_index == NULL) {
    std::cerr << "ERROR: Header index could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare input range
  size_t ca3m_range_start = 0;
  size_t ca3m_range_end = ca3m_index->n_entries;

  // Foreach entry
  #pragma omp parallel for shared(ca3m_index, ca3m_data, a3m_data_fh, a3m_index_fh, a3m_offset)
  for(size_t entry_index = ca3m_range_start; entry_index < ca3m_range_end; entry_index++)
  {
    ffindex_entry_t* entry = ffindex_get_entry_by_index(ca3m_index, entry_index);
    if(entry == NULL) {
      // entry is NULL here; report the index instead of dereferencing it.
      std::cerr << "ERROR: Could not get CA3M index entry " << entry_index << "!" << std::endl;
      continue;
    }

    char* data = ffindex_get_data_by_entry(ca3m_data, entry);

    std::stringstream out_buffer;
    compressed_a3m::extract_a3m(data, entry->length, sequence_index, sequence_data, header_index, header_data, &out_buffer);

    std::string out_string = out_buffer.str();

    // The output files and the running offset are shared between threads.
    #pragma omp critical
    {
      ffindex_insert_memory(a3m_data_fh, a3m_index_fh, &a3m_offset, const_cast<char*>(out_string.c_str()), out_string.size(), entry->name);
    }
  }

  fclose(a3m_data_fh);
  fclose(a3m_index_fh);

  ffsort_index(a3mIndexFile.c_str());

  return 0;
}
/*
 * Split a FASTA file into two FFindex databases: one holding the header line
 * of every record and one holding its sequence (newlines removed).
 *
 * Positional arguments, in order:
 *   DATA_HEADER INDEX_HEADER DATA_SEQUENCE INDEX_SEQUENCE FASTA_FILE
 * Options:
 *   -s  sort both index files after writing
 *   -v  print version information and exit
 *
 * None of the four output files may exist beforehand. The entry name is the
 * text of the header up to the first whitespace, passed through
 * get_short_id(); records without such text are named by their ordinal.
 */
int main(int argn, char **argv)
{
  int sort = 0, version = 0;
  int opt, err = EXIT_SUCCESS;
  while ((opt = getopt(argn, argv, "sv")) != -1)
  {
    switch (opt)
    {
      case 's':
        sort = 1;
        break;
      case 'v':
        version = 1;
        break;
      default:
        usage(argv[0]);
        return EXIT_FAILURE;
    }
  }

  if(version == 1)
  {
    /* Don't you dare running it on a platform where byte != 8 bits */
    printf("%s version %.2f, off_t = %zd bits\n", argv[0], FFINDEX_VERSION, sizeof(off_t) * 8);
    return EXIT_SUCCESS;
  }

  /* Five positional arguments are read below, so all five must be present. */
  if(argn - optind < 5)
  {
    usage(argv[0]);
    return EXIT_FAILURE;
  }


  char *data_header_filename  = argv[optind++];
  char *index_header_filename = argv[optind++];
  char *data_sequence_filename = argv[optind++];
  char *index_sequence_filename = argv[optind++];

  char *fasta_filename = argv[optind++];

  printf("data header file: %s\n", data_header_filename);
  printf("index header file: %s\n", index_header_filename);
  printf("data sequence file: %s\n", data_sequence_filename);
  printf("index sequence file: %s\n", index_sequence_filename);
  printf("fasta file: %s\n", fasta_filename);


  FILE *data_header_file, *index_header_file, *data_sequence_file, *index_sequence_file, *fasta_file;
  size_t header_offset = 0;
  size_t sequence_offset = 0;

  struct stat st;

  // open header ffindex (refuse to overwrite existing files)
  if(stat(data_header_filename, &st) == 0) { errno = EEXIST; perror(data_header_filename); return EXIT_FAILURE; }
  data_header_file  = fopen(data_header_filename, "w");
  if( data_header_file == NULL) { perror(data_header_filename); return EXIT_FAILURE; }

  if(stat(index_header_filename, &st) == 0) { errno = EEXIST; perror(index_header_filename); return EXIT_FAILURE; }
  index_header_file = fopen(index_header_filename, "w+");
  if(index_header_file == NULL) { perror(index_header_filename); return EXIT_FAILURE; }

  //open sequence ffindex
  if(stat(data_sequence_filename, &st) == 0) { errno = EEXIST; perror(data_sequence_filename); return EXIT_FAILURE; }
  data_sequence_file  = fopen(data_sequence_filename, "w");
  if( data_sequence_file == NULL) { perror(data_sequence_filename); return EXIT_FAILURE; }

  if(stat(index_sequence_filename, &st) == 0) { errno = EEXIST; perror(index_sequence_filename); return EXIT_FAILURE; }
  index_sequence_file = fopen(index_sequence_filename, "w+");
  if(index_sequence_file == NULL) { perror(index_sequence_filename); return EXIT_FAILURE; }

  fasta_file = fopen(fasta_filename, "r");
  if(fasta_file == NULL) { perror(fasta_filename); return EXIT_FAILURE; }

  size_t fasta_size;
  char *fasta_data = ffindex_mmap_data(fasta_file, &fasta_size);

  char name[FFINDEX_MAX_ENTRY_NAME_LENTH];
  int seq_id = 1;
  size_t seq_id_length = 0;
  size_t count_ws = 0;

  char header[MAX_ENTRY_LENGTH];
  header[0] = '>';
  size_t header_length = 1;
  char is_header = 1;

  char sequence[MAX_ENTRY_LENGTH];
  size_t sequence_length = 0;

  for(size_t fasta_offset = 1; fasta_offset < fasta_size; fasta_offset++) // position after first ">"
  {
    seq_id_length = 0;
    count_ws = 0;

    is_header = 1;
    header_length = 1; /* header[0] already holds the leading '>' */

    sequence_length = 0;

    /* Consume one record: everything up to the next '>' that starts a line. */
    while(fasta_offset < fasta_size && !(*(fasta_data + fasta_offset) == '>' && *(fasta_data + fasta_offset - 1) == '\n'))
    {
      char input = *(fasta_data + fasta_offset);

      //get fasta name: the characters before the first whitespace
      if(isspace((unsigned char)input))
      {
        count_ws++;
      }
      else if(count_ws == 0 && seq_id_length < FFINDEX_MAX_ENTRY_NAME_LENTH - 1)
      {
        /* Over-long names are truncated instead of overflowing the buffer. */
        name[seq_id_length++] = input;
      }

      if(input == '\n') {
        /* The first newline ends the header; later ones are dropped. */
        is_header = 0;
      }
      else if(is_header == 1) {
        /* Keep one byte free for the terminator; excess input is dropped. */
        if(header_length < MAX_ENTRY_LENGTH - 1)
          header[header_length++] = input;
      }
      else {
        if(sequence_length < MAX_ENTRY_LENGTH - 1)
          sequence[sequence_length++] = input;
      }

      fasta_offset++;
    }

    /* Terminate all buffers, also when the file ends in the middle of a line. */
    name[seq_id_length] = '\0';
    header[header_length] = '\0';
    sequence[sequence_length] = '\0';

    if(seq_id_length == 0) {
      sprintf(name, "%d", seq_id);
    }
    seq_id++;

    get_short_id(name, '|', 2);

    ffindex_insert_memory(data_header_file, index_header_file, &header_offset, header, header_length, name);
    ffindex_insert_memory(data_sequence_file, index_sequence_file, &sequence_offset, sequence, sequence_length, name);
  }
  fclose(data_header_file);
  fclose(data_sequence_file);

  fclose(index_header_file);
  fclose(index_sequence_file);

  /* Sort the index entries and write back */
  if(sort) {
    ffsort_index(index_header_filename);
    ffsort_index(index_sequence_filename);
  }

  return err;
}
/**
 * Command-line entry point: filter every alignment of an A3M ffindex database
 * down to the sequences whose short id is listed in a set file.
 *
 * Options (all three are required):
 *   -i PREFIX  input a3m database
 *   -s FILE    text file with one short id per line
 *   -o PREFIX  output a3m database
 *   -h         print usage and exit
 *
 * Annotation lines ('#'), consensus sequences and the ss_dssp / sa_dssp /
 * ss_pred / ss_conf records are always kept. A cluster is only written when
 * at least one of its sequences passed the filter.
 *
 * @return 0 on success, 1 on any error.
 */
int main(int argc, char **argv) {
  // Initialise every flag; previously only oflag was set to false.
  bool iflag = false, sflag = false, oflag = false;

  std::string set_file;
  std::string ffindex_oa3m_db_prefix;
  std::string ffindex_a3m_db_prefix;

  int c;
  while ((c = getopt(argc, argv, "i:s:o:h")) != -1) {
    switch (c) {
      case 'i':
        iflag = true;
        ffindex_a3m_db_prefix = optarg;
        break;
      case 's':
        sflag = true;
        set_file = optarg;
        break;
      case 'o':
        oflag = true;
        ffindex_oa3m_db_prefix = optarg;
        break;
      case 'h':
        usage();
        exit(0);
      case '?':
        // optopt holds the option that is missing its argument.
        if (optopt == 'i' || optopt == 's' || optopt == 'o')
          fprintf(stderr, "Option -%c requires an argument.\n", optopt);
        else if (isprint(optopt))
          fprintf(stderr, "Unknown option `-%c'.\n", optopt);
        else
          fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
        return 1;
      default:
        abort();
    }
  }

  if(!iflag || !sflag || !oflag) {
    std::cerr << "Missing input!" << std::endl;
    usage();
    exit(1);
  }

  //prepare ffindex a3m output database
  std::string oa3mDataFile = ffindex_oa3m_db_prefix+".ffdata";
  std::string oa3mIndexFile = ffindex_oa3m_db_prefix+".ffindex";

  FILE *oa3m_data_fh  = fopen(oa3mDataFile.c_str(), "w");
  FILE *oa3m_index_fh = fopen(oa3mIndexFile.c_str(), "w");

  if (oa3m_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex ca3m data file! (" << oa3mDataFile << ")!" << std::endl;
    exit(1);
  }

  if(oa3m_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex ca3m index file! (" << oa3mIndexFile << ")!" << std::endl;
    exit(1);
  }

  size_t oa3m_offset = 0;

  //prepare ffindex a3m database
  std::string a3mDataFile = ffindex_a3m_db_prefix+".ffdata";
  std::string a3mIndexFile = ffindex_a3m_db_prefix+".ffindex";

  FILE *a3m_data_fh  = fopen(a3mDataFile.c_str(), "r");
  FILE *a3m_index_fh = fopen(a3mIndexFile.c_str(), "r");

  if (a3m_data_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex a3m data file! (" << a3mDataFile << ")!" << std::endl;
    exit(1);
  }

  if(a3m_index_fh == NULL) {
    std::cerr << "ERROR: Could not open ffindex a3m index file! (" << a3mIndexFile << ")!" << std::endl;
    exit(1);
  }

  // Receives the size of the mapped a3m data file.
  size_t a3m_offset;
  char* a3m_data = ffindex_mmap_data(a3m_data_fh, &a3m_offset);
  ffindex_index_t* a3m_index = ffindex_index_parse(a3m_index_fh, 0);

  if(a3m_index == NULL) {
    std::cerr << "ERROR: A3M index could not be loaded!" << std::endl;
    exit(1);
  }

  //prepare filter: one id per line of the set file
  std::set<std::string> filter;
  std::ifstream infile(set_file.c_str());

  std::string line;
  while (std::getline(infile, line)) {
    filter.insert(line);
  }

  infile.close();

  //prepare input range
  size_t a3m_range_start = 0;
  size_t a3m_range_end = a3m_index->n_entries;

  // Foreach entry
  #pragma omp parallel for shared(a3m_index, a3m_data, oa3m_data_fh, oa3m_index_fh, oa3m_offset)
  for(size_t entry_index = a3m_range_start; entry_index < a3m_range_end; entry_index++)
  {
    //fprintf(stderr, "index %ld\n", entry_index);
    ffindex_entry_t* entry = ffindex_get_entry_by_index(a3m_index, entry_index);
    if(entry == NULL) {
      // entry is NULL here; report the index instead of dereferencing it.
      std::cerr << "ERROR: Could not get A3M index entry " << entry_index << "!" << std::endl;
      continue;
    }

    char* data = ffindex_get_data_by_entry(a3m_data, entry);

    std::stringstream out_buffer;

    size_t nr_sequences = 0;

    for(size_t index = 0; index < entry->length; index++) {
      //write annotation line
      if(data[index] == '#') {
        // Test the bound first so data[] is never read past the entry.
        while(index < entry->length && data[index] != '\n') {
          out_buffer.put(data[index++]);
        }
        out_buffer.put('\n');
      }
      else if(data[index] == '>') {
        size_t start_index = index;
        while(index < entry->length && data[index] != '\n') {
          index++;
        }

        //copy line without new line
        std::string header = std::string(&data[start_index], index - start_index);
        std::string id = getNameFromHeader(header);
        bool consensus_flag = isConsensus(id);

        std::string short_id = getShortIdFromHeader(header);

        // Advance to the start of the next record or the last byte of the entry.
        while(index < entry->length - 1 && data[index] != '>') {
          index++;
        }
        // Step back so the outer loop's increment lands on the next '>'.
        if(data[index] == '>' || data[index] == '\0') {
          index--;
        }

        bool passedFilter = false;
        if(filter.find(short_id) != filter.end()) {
          nr_sequences++;
          passedFilter = true;
        }

        if(passedFilter ||
            consensus_flag ||
            id.compare("ss_dssp") == 0 ||
            id.compare("sa_dssp") == 0 ||
            id.compare("ss_pred") == 0 ||
            id.compare("ss_conf") == 0) {
          std::string seq = std::string(&data[start_index], index - start_index);
          out_buffer.write(seq.c_str(), seq.size());
          out_buffer.put('\n');
        }
      }
    }

    if(nr_sequences > 0) {
      std::string out_string = out_buffer.str();
      // The output files and the running offset are shared between threads.
      #pragma omp critical
      {
        ffindex_insert_memory(oa3m_data_fh, oa3m_index_fh, &oa3m_offset, const_cast<char*>(out_string.c_str()), out_string.size(), entry->name);
      }
    }
    else {
      std::cerr << "WARNING: No sequences left for cluster " << entry->name << std::endl;
    }
  }

  fclose(oa3m_data_fh);
  // The index file was opened above as well and must be closed too.
  fclose(oa3m_index_fh);

  return 0;
}
Esempio n. 7
0
/*
 * ffindex_get: print the contents of selected entries of an FFindex database
 * to stdout.
 *
 * Usage: [-n | --byindex] DATA_FILENAME INDEX_FILENAME ENTRY...
 *
 * Without -n every ENTRY is looked up by name; with -n every ENTRY is a
 * 1-based position in the index. Entries that cannot be found are reported
 * via fferror_print() and skipped. The last byte of each entry is not written.
 */
int main(int argn, char **argv)
{
  int by_index = 0;
  static struct option long_options[] =
  {
    { "byindex", no_argument, NULL, 'n' },
    { NULL,      0,           NULL,  0  }
  };

  int opt;
  while (1)
  {
    int option_index = 0;
    opt = getopt_long(argn, argv, "n", long_options, &option_index);
    if (opt == -1)
      break;

    switch (opt)
    {
      case 'n':
        by_index = 1;
        break;
      default:
        usage(argv[0]);
        return EXIT_FAILURE;
    }
  }

  /* Count only what is left after option parsing: both file names are
   * required, otherwise argv[optind] below would be NULL. */
  if(argn - optind < 2)
  {
    usage(argv[0]);
    return EXIT_FAILURE;
  }
  char *data_filename  = argv[optind++];
  char *index_filename = argv[optind++];

  FILE *data_file  = fopen(data_filename,  "r");
  FILE *index_file = fopen(index_filename, "r");

  if( data_file == NULL) { fferror_print(__FILE__, __LINE__, "ffindex_get", data_filename);  exit(EXIT_FAILURE); }
  if(index_file == NULL) { fferror_print(__FILE__, __LINE__, "ffindex_get", index_filename);  exit(EXIT_FAILURE); }

  size_t data_size;
  char *data = ffindex_mmap_data(data_file, &data_size);

  ffindex_index_t* index = ffindex_index_parse(index_file, 0);
  if(index == NULL)
  {
    fferror_print(__FILE__, __LINE__, "ffindex_index_parse", index_filename);
    exit(EXIT_FAILURE);
  }

  if(by_index)
  {
    for(int i = optind; i < argn; i++)
    {
      size_t index_n = atol(argv[i]) - 1; // offset from 0 but specify from 1

      ffindex_entry_t* entry = ffindex_get_entry_by_index(index, index_n);
      if(entry == NULL)
      {
        errno = ENOENT; 
        fferror_print(__FILE__, __LINE__, "ffindex_get entry index out of range", argv[i]);
      }
      else
      {
        char *filedata = ffindex_get_data_by_entry(data, entry);
        if(filedata == NULL)
        {
          errno = ENOENT; 
          fferror_print(__FILE__, __LINE__, "ffindex_get entry index out of range", argv[i]);
        }
        else if(entry->length > 0) /* length - 1 must not wrap around */
          fwrite(filedata, entry->length - 1, 1, stdout);
      }
    }
  }
  else // by name
  {
    for(int i = optind; i < argn; i++)
    {
      char *filename = argv[i];

      ffindex_entry_t* entry = ffindex_get_entry_by_name(index, filename);
      if(entry == NULL)
      {
        errno = ENOENT; 
        fferror_print(__FILE__, __LINE__, "ffindex_get key not found in index", filename);
      }
      else
      {
        char *filedata = ffindex_get_data_by_entry(data, entry);
        if(filedata == NULL)
        {
          errno = ENOENT; 
          fferror_print(__FILE__, __LINE__, "ffindex_get key not found in index", filename);
        }
        else if(entry->length > 0) /* length - 1 must not wrap around */
          fwrite(filedata, entry->length - 1, 1, stdout);
      }
    }

      /* Alternative code using (slower) ffindex_fopen */
      /*
         FILE *file = ffindex_fopen(data, index, filename);
         if(file == NULL)
         {
         errno = ENOENT; 
         fferror_print(__FILE__, __LINE__, "ffindex_fopen file not found in index", filename);
         }
         else
         {
         char line[LINE_MAX];
         while(fgets(line, LINE_MAX, file) != NULL)
         printf("%s", line);
         }
         */
  }

  return 0;
}
Esempio n. 8
0
/*
 * Build an FFindex database from a FASTA file, one entry per record.
 *
 * Usage: [-s] [-v] DATA_FILENAME INDEX_FILENAME FASTA_FILENAME
 *   -s  sort the index after writing it
 *   -v  print version information and exit
 *
 * Neither output file may exist beforehand. Entries are named by their
 * ordinal, counting from 1.
 */
int main(int argn, char **argv)
{
  int want_sort = 0, want_version = 0;
  int status = EXIT_SUCCESS;
  int flag;

  for(flag = getopt(argn, argv, "sv"); flag != -1; flag = getopt(argn, argv, "sv"))
  {
    if(flag == 's')
      want_sort = 1;
    else if(flag == 'v')
      want_version = 1;
    else
    {
      usage(argv[0]);
      return EXIT_FAILURE;
    }
  }

  if(want_version == 1)
  {
    /* Assumes 8-bit bytes when reporting the width of off_t. */
    printf("%s version %.2f, off_t = %zd bits\n", argv[0], FFINDEX_VERSION, sizeof(off_t) * 8);
    return EXIT_SUCCESS;
  }

  if(argn - optind < 3)
  {
    usage(argv[0]);
    return EXIT_FAILURE;
  }

  char *data_filename  = argv[optind];
  char *index_filename = argv[optind + 1];
  char *fasta_filename = argv[optind + 2];
  optind += 3;

  struct stat file_info;

  /* Data file: refuse to overwrite, then create. */
  if(stat(data_filename, &file_info) == 0)
  {
    errno = EEXIST;
    perror(data_filename);
    return EXIT_FAILURE;
  }
  FILE *data_file = fopen(data_filename, "w");
  if(data_file == NULL)
  {
    perror(data_filename);
    return EXIT_FAILURE;
  }

  /* Index file: refuse to overwrite, then create readable and writable. */
  if(stat(index_filename, &file_info) == 0)
  {
    errno = EEXIST;
    perror(index_filename);
    return EXIT_FAILURE;
  }
  FILE *index_file = fopen(index_filename, "w+");
  if(index_file == NULL)
  {
    perror(index_filename);
    return EXIT_FAILURE;
  }

  FILE *fasta_file = fopen(fasta_filename, "r");
  if(fasta_file == NULL)
  {
    perror(fasta_filename);
    return EXIT_FAILURE;
  }

  size_t fasta_size;
  char *fasta_data = ffindex_mmap_data(fasta_file, &fasta_size);
  size_t offset = 0;
  char name[FFINDEX_MAX_ENTRY_NAME_LENTH];
  int seq_id = 1;

  /* Scanning starts at position 1, i.e. right after the first ">". */
  size_t cursor = 1;
  while(cursor < fasta_size)
  {
    /* The record includes the ">" immediately before the scan position. */
    size_t record_start = cursor - 1;

    /* Advance until a ">" that directly follows a newline, or end of data. */
    while(cursor < fasta_size && (fasta_data[cursor] != '>' || fasta_data[cursor - 1] != '\n'))
      cursor++;

    sprintf(name, "%d", seq_id);
    seq_id++;
    ffindex_insert_memory(data_file, index_file, &offset, fasta_data + record_start, cursor - record_start, name);

    /* Step over the ">" that ended this record. */
    cursor++;
  }
  fclose(data_file);

  /* Optionally re-read the index that was just written, sort it, and
   * write it back to the same file name. */
  if(want_sort)
  {
    rewind(index_file);
    ffindex_index_t* index = ffindex_index_parse(index_file, 0);
    if(index == NULL)
    {
      perror("ffindex_index_parse failed");
      exit(EXIT_FAILURE);
    }
    fclose(index_file);
    ffindex_sort_index_file(index);
    index_file = fopen(index_filename, "w");
    if(index_file == NULL)
    {
      perror(index_filename);
      return EXIT_FAILURE;
    }
    status += ffindex_write(index, index_file);
  }

  return status;
}
Esempio n. 9
0
int main(int argn, char **argv)
{
  int sort = 0, version = 0;
  int opt, err = EXIT_SUCCESS;
  int user_selected_field_index = 1;
  while ((opt = getopt(argn, argv, "svk:")) != -1)
  {
    switch (opt)
    {
      case 'k':
        user_selected_field_index = optind;
        break;
      case 's':
        sort = 1;
        break;
      case 'v':
        version = 1;
        break;
      default:
        usage(argv[0]);
        return EXIT_FAILURE;
    }
  }

  if(version == 1)
  {
    /* Don't you dare running it on a platform where byte != 8 bits */
    printf("%s version %.2f, off_t = %zd bits\n", argv[0], FFINDEX_VERSION, sizeof(off_t) * 8);
    return EXIT_SUCCESS;
  }

  if(argn - optind < 3)
  {
    usage(argv[0]);
    return EXIT_FAILURE;
  }


  char *data_filename  = argv[optind++];
  char *index_filename = argv[optind++];
  char *tsv_filename = argv[optind++];
  FILE *data_file, *index_file;
  size_t offset = 0;

  /* open ffindex */
  err = ffindex_index_open(data_filename, index_filename, "w", &data_file, &index_file, &offset);
  if(err != EXIT_SUCCESS)
    return err;

  FILE* tsv_file = fopen(tsv_filename, "r");
  if(tsv_file == NULL) { perror(tsv_filename); return EXIT_FAILURE; }

  size_t tsv_size;
  char* tsv_data = ffindex_mmap_data(tsv_file, &tsv_size);
  char* tsv_current = tsv_data;
  char* tsv_part_begin = tsv_data;
  char* tsv_last = tsv_data + tsv_size;

  char field_current[FFINDEX_MAX_ENTRY_NAME_LENTH + 1]; // + seperator
  size_t field_current_length = 0; // + seperator

  char* tsv_selected_field_start = NULL;
  size_t tsv_selected_field_length = 0;
  tsv_current = tsv_scan_line_for_field(tsv_current, user_selected_field_index, &tsv_selected_field_start, &tsv_selected_field_length);
  strncpy(field_current, tsv_selected_field_start, tsv_selected_field_length); //XXX
  field_current_length = tsv_selected_field_length;
  field_current[field_current_length] = '\0';
  while(tsv_current < tsv_last)
  {
    char* tsv_next;
    tsv_next = tsv_scan_line_for_field(tsv_current, user_selected_field_index, &tsv_selected_field_start, &tsv_selected_field_length);
    if((tsv_selected_field_length != field_current_length ||
       strncmp(field_current, tsv_selected_field_start, tsv_selected_field_length) != 0)) // XXX got a new field value
    {
      ffindex_insert_memory(data_file, index_file, &offset, tsv_part_begin, tsv_current - tsv_part_begin, field_current);
      strncpy(field_current, tsv_selected_field_start, tsv_selected_field_length); //XXX
      field_current_length = tsv_selected_field_length;
      field_current[field_current_length] = '\0';
      tsv_part_begin = tsv_current;
    }
    tsv_current = tsv_next;
  }
  ffindex_insert_memory(data_file, index_file, &offset, tsv_part_begin, tsv_current - tsv_part_begin, field_current);
  strncpy(field_current, tsv_selected_field_start, tsv_selected_field_length); //XXX
  field_current_length = tsv_selected_field_length;
  field_current[field_current_length] = '\0';

  fclose(data_file);

  /* Sort the index entries and write back */
  if(sort)
  {
    rewind(index_file);
    ffindex_index_t* index = ffindex_index_parse(index_file, 0);
    if(index == NULL)
    {
      perror("ffindex_index_parse failed");
      exit(EXIT_FAILURE);
    }
    fclose(index_file);
    ffindex_sort_index_file(index);
    index_file = fopen(index_filename, "w");
    if(index_file == NULL) { perror(index_filename); return EXIT_FAILURE; }
    err += ffindex_write(index, index_file);
  }

  return err;
}
Esempio n. 10
0
/*
 * Run PROGRAM once per index entry, feeding the entry's data to it on stdin.
 *
 * Usage: argv[0] DATA_FILENAME INDEX_FILENAME PROGRAM [PROGRAM_ARGS]*
 *
 * The data file is mapped with ffindex_mmap_data() and the index is parsed
 * with ffindex_index_parse(). For every entry a pipe is created, a child is
 * forked that execs PROGRAM with the read end as its stdin, and the parent
 * writes the entry's bytes into the write end, then waits for the child.
 * Entries are processed strictly one after another.
 *
 * Returns 0 after all entries have been visited and -1 when too few
 * arguments are given. Exits with EXIT_FAILURE if either file cannot be
 * opened or the index cannot be parsed, and with the fork() errno value if
 * a child cannot be created.
 */
int main(int argn, char **argv)
{
  if(argn < 4)
  {
    fprintf(stderr, "USAGE: %s DATA_FILENAME INDEX_FILENAME PROGRAM [PROGRAM_ARGS]*\n"
                    "\nDesigned and implemented by Andy Hauser <*****@*****.**>.\n",
                    argv[0]);
    return -1;
  }
  char *data_filename  = argv[1];
  char *index_filename = argv[2];
  char *program_name   = argv[3];
  // argv is NULL-terminated, so its tail starting at PROGRAM is a valid exec vector.
  char **program_argv = argv + 3;

  FILE *data_file  = fopen(data_filename,  "r");
  FILE *index_file = fopen(index_filename, "r");

  if( data_file == NULL) { fferror_print(__FILE__, __LINE__, argv[0], data_filename);  exit(EXIT_FAILURE); }
  if(index_file == NULL) { fferror_print(__FILE__, __LINE__, argv[0], index_filename);  exit(EXIT_FAILURE); }

  size_t data_size;
  char *data = ffindex_mmap_data(data_file, &data_size);

  ffindex_index_t* index = ffindex_index_parse(index_file, 0);
  if(index == NULL)
  {
    fferror_print(__FILE__, __LINE__, "ffindex_index_parse", index_filename);
    exit(EXIT_FAILURE);
  }

  // Ignore SIGPIPE so that a child which stops reading early makes write()
  // fail with EPIPE instead of killing this process.
  // NOTE(review): an ignored disposition is inherited across exec, so PROGRAM
  // also starts with SIGPIPE ignored -- confirm that this is intended.
  struct sigaction handler;
  handler.sa_handler = SIG_IGN;
  sigemptyset(&handler.sa_mask);
  handler.sa_flags = 0;
  sigaction(SIGPIPE, &handler, NULL);

  size_t range_start = 0;
  size_t range_end = index->n_entries;

  // Foreach entry
  for(size_t entry_index = range_start; entry_index < range_end; entry_index++)
  {
    ffindex_entry_t* entry = ffindex_get_entry_by_index(index, entry_index);
    if(entry == NULL)
    {
      // There is no entry to take a name from, so report the index instead.
      fprintf(stderr, "%s: no entry at index %zu\n", argv[0], entry_index);
      continue;
    }

    int pipefd[2];
    if(pipe(pipefd) != 0) { perror(entry->name); continue; }

    pid_t child_pid = fork();
    if(child_pid == 0)
    {
      fclose(data_file);
      fclose(index_file);
      close(pipefd[1]);

      // Make pipe from parent our new stdin
      if(dup2(pipefd[0], fileno(stdin)) < 0)
      {
        perror(entry->name);
        _exit(EXIT_FAILURE); // never run PROGRAM on the wrong stdin
      }
      close(pipefd[0]);

      // exec program with the pipe as stdin
      execvp(program_name, program_argv);

      // Only reached when exec failed. The child must terminate here,
      // otherwise it would carry on with the parent's loop and fork children
      // of its own. _exit() avoids flushing stdio buffers copied from the parent.
      perror(program_name);
      _exit(127); // same status a shell reports for a command it cannot run
    }
    else if(child_pid > 0)
    {
      // Read end is for child only
      close(pipefd[0]);

      // Write file data to child's stdin.
      char *filedata = ffindex_get_data_by_entry(data, entry);
      const size_t length = entry->length;
      size_t written = 0;
      while(written < length)
      {
        ssize_t w = write(pipefd[1], filedata + written, length - written);
        if(w < 0)
        {
          if(errno == EINTR)
            continue; // interrupted before any byte was written: retry
          // EPIPE only means the child closed its stdin early, which is not
          // an error for us; anything else is reported.
          if(errno != EPIPE)
            perror(entry->name);
          break;
        }
        if(w == 0)
        {
          // No progress at all: give up on this entry rather than spin.
          fprintf(stderr, "%s: write to child made no progress\n", entry->name);
          break;
        }
        written += (size_t)w;
      }

      close(pipefd[1]); // child gets EOF

      // Reap the child, retrying if the wait is interrupted by a signal.
      while(waitpid(child_pid, NULL, 0) < 0 && errno == EINTR)
        ;
    }
    else
    {
      // fork failed: keep errno intact across perror() for the exit status.
      int fork_errno = errno;
      perror(entry->name);
      close(pipefd[0]);
      close(pipefd[1]);
      exit(fork_errno);
    }
  }

  return 0;
}