Example #1
0
File: main.c Project: zydeon/mosal
/*
 * Program entry point.
 *
 * Expects exactly four arguments: two sequence files, a substitution
 * table file and the number of mid bounds.  The sequences are loaded into
 * the globals seq1/seq2, the tables are initialised, align() is run and
 * the score entries stored in Q[M % 2][N] are printed one per line as
 * "<matches> <gaps>".
 *
 * Returns 0 on success and 1 when the argument count is wrong.
 */
int main( int argc, char *argv[]) {

	if(argc != 5){
		/* a usage error is a failure: report it on stderr with a non-zero status */
		fprintf(stderr, "Usage: %s <seq1_file> <seq2_file> <subs_file> <mid_bounds_number>\n", argv[0]);
		return 1;
	}

	read_sequence(seq1, argv[1]);
	read_sequence(seq2, argv[2]);

	M = strlen(seq1);		N = strlen(seq2);

	init_subs_table( argv[3] );
	init_bounds( atoi(argv[4]) );
	init_dynamic_tables();
	align();

	//print scores
	int i, line;
	line = M % 2;	// determine from which line to start printing	
	for( i = 0; i < Q[line][N].num ; ++i )
		printf("%d %d\n", Q[line][N].scores[i].matches, Q[line][N].scores[i].gaps);

	remove_dynamic_tables();
	remove_bounds_tables();

	return 0;
}
/*
 * selinux_status_policyload
 *
 * Reports how many times the policy has been reloaded on the running
 * system.  In fallback mode the value is not reliable until the first
 * event message has arrived over the netlink socket, so callers should
 * compare it with a previously returned value to detect a reload rather
 * than interpret the number on its own.
 *
 * Returns -1 (with errno set to EINVAL) when the status page has not been
 * opened, and -1 when the netlink check fails in fallback mode.
 */
int selinux_status_policyload(void)
{
	uint32_t	seq_before;
	uint32_t	reload_count;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (selinux_status == MAP_FAILED) {
		/* fallback mode: the counter is maintained from netlink events */
		if (avc_netlink_check_nb() < 0)
			return -1;

		return fallback_policyload;
	}

	/* re-read until the sequence number is stable across the access */
	for (;;) {
		seq_before = read_sequence(selinux_status);

		reload_count = selinux_status->policyload;

		if (seq_before == read_sequence(selinux_status))
			break;
	}

	return reload_count;
}
/*
 * selinux_status_getenforce
 *
 * Reports the mode SELinux is currently running in:
 * 1 for enforcing mode, 0 for permissive mode.
 *
 * Returns -1 (with errno set to EINVAL) when the status page has not been
 * opened, and -1 when the netlink check fails in fallback mode.
 */
int selinux_status_getenforce(void)
{
	uint32_t	seq_before;
	uint32_t	mode;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (selinux_status == MAP_FAILED) {
		/* fallback mode: the flag is maintained from netlink events */
		if (avc_netlink_check_nb() < 0)
			return -1;

		return fallback_enforcing;
	}

	/* re-read until the sequence number is stable across the access */
	for (;;) {
		seq_before = read_sequence(selinux_status);

		mode = selinux_status->enforcing;

		if (seq_before == read_sequence(selinux_status))
			break;
	}

	return mode != 0;
}
Example #4
0
/*
 * Environment input step.
 *
 * Expects four arguments: two sequence file names followed by the tile
 * width and tile height.  Both sequences are read into one SeqData item,
 * the tile geometry is derived from the sequence lengths, and the item,
 * the start time and all steps (edge initialisation, one swStep per tile,
 * and the environment output step) are put/prescribed on `context`.
 */
void cncEnvIn(int argc, char **argv, Context *context) {
    CNC_REQUIRE(argc == 5, "Usage: %s fileName1 fileName2 tileWidth tileHeight\n", argv[0]);

    // Open sequence input files
    FILE *file1 = open_file(argv[1]);
    FILE *file2 = open_file(argv[2]);
    size_t filesize1 = file_length(file1);
    size_t filesize2 = file_length(file2);

    // Allocate tile data item and read sequence data
    SeqData *data;
    size_t dataSize = sizeof(SeqData) + filesize1 + filesize2 + 2;
    cncHandle_t dataHandle = cncCreateItemSized_data(&data, dataSize);
    data->seq2offset = filesize1 + 1;
    size_t length1 = read_sequence(file1, 1, SEQ1(data), filesize1);
    size_t length2 = read_sequence(file2, 2, SEQ2(data), filesize2);

    // Tile width and height
    int tw = atoi(argv[3]);
    int th = atoi(argv[4]);
    PRINTF("Tile width:  %d\n", tw);
    PRINTF("Tile height: %d\n", th);
    // atoi() yields 0 for non-numeric text; a zero tile size would divide
    // by zero below, so reject non-positive sizes before using them.
    CNC_REQUIRE(tw > 0 && th > 0, "Tile width and height must be positive.\n");
    CNC_REQUIRE((size_t)tw <= length1 && (size_t)th <= length2, "Tile size too large for given input.\n");

    // Number of tiles wide and high
    int ntw = length1 / tw;
    int nth = length2 / th;
    PRINTF("Imported %d x %d tiles.\n", ntw, nth);

    // Initialize tile dimension data and put
    data->tw = tw;
    data->th = th;
    data->ntw = ntw;
    data->nth = nth;
    memcpy(data->score_matrix, ALIGNMENT_SCORES, sizeof(ALIGNMENT_SCORES));
    cncPut_data(dataHandle, 0, context);

    // Record starting time
    struct timeval *startTime;
    cncHandle_t startTime_handle = cncCreateItem_startTime(&startTime, 1);
    gettimeofday(startTime, 0);
    cncPut_startTime(startTime_handle, 0, context);

    // Seed edges
    cncPrescribe_initAboveStep(tw, ntw, context);
    cncPrescribe_initLeftStep(th, nth, context);

    // One swStep per tile, row by row
    int i, j;
    for(i = 0; i < nth; i++){
        for(j = 0; j < ntw; j++){
            cncPrescribe_swStep(i, j, context);
        }
    }

    cncPrescribe_cncEnvOut(ntw, nth, tw, context);
}
Example #5
0
/*
 * Read one X/Y pair from `f` into `input`.
 *
 * X is read first (terminated by END_SEQUENCE).  If that read reports a
 * non-zero code the function returns 0 without touching Y; otherwise Y is
 * read (terminated by END_PAIR) and 1 is returned.  The result of the
 * second read is not inspected.
 */
int read_line (FILE* f, INPUT* input, int max_sequence) {
    int first_status = read_sequence(f, &input->X, &input->X_length, max_sequence, END_SEQUENCE);

    if(first_status == 0) {
        read_sequence(f, &input->Y, &input->Y_length, max_sequence, END_PAIR);
        return 1;
    }

    return 0;
}
  // Fill `buff` with sequence characters taken from the FASTQ stream in
  // `st`. If a seam was saved by a previous call, its mer_len_ - 1
  // characters are copied to the front of the buffer first. An 'N' is
  // written between consecutive reads. On exit the last mer_len_ - 1
  // characters of the buffer are saved as the new seam, provided the
  // buffer holds at least that many.
  void read_fastq(stream_status& st, sequence_ptr& buff) {
    size_t filled = 0;
    if(st.have_seam) {
      memcpy(buff.start, st.seam, mer_len_ - 1);
      filled = mer_len_ - 1;
    }

    // The stream is assumed to always be positioned on sequence data
    // (or at EOF), never on a header line.
    while(st.stream->good() && filled < buf_size_ - mer_len_ - 1) {
      const size_t got = read_sequence(*st.stream, filled, buff.start, '+');
      filled     += got;
      st.seq_len += got;

      if(st.stream->peek() != '+')
        continue;

      // End of the sequence part: skip the quality section
      skip_quals(*st.stream, st.seq_len);
      if(st.stream->good()) {
        buff.start[filled++] = 'N'; // separator between reads
        ignore_line(*st.stream);    // skip the next sequence header
        ++reads_read_;
      }
      st.seq_len = 0;
    }
    buff.end = buff.start + filled;

    st.have_seam = filled >= (size_t)(mer_len_ - 1);
    if(st.have_seam)
      memcpy(st.seam, buff.end - mer_len_ + 1, mer_len_ - 1);
  }
Example #7
0
/*
 * Calls read_sequence() once for every entry of a fixed list of .dat file
 * paths, in list order.  Always returns 0.
 */
int test_case1()
{	
	char* file_names[] = 
	{
		"/home/guoqiang/code/0.0.2/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
		"/home/guoqiang/code/NewMediaServer/new_ms/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
		"/home/guoqiang/code/NewMediaServer/new_ms/MediaSvr/14C39AB98D3205AC196308FAA27B3485BCB7109C.dat",
		"/home/guoqiang/code/NewMediaServer/new_ms1/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
		"/home/guoqiang/code/NewMediaServer/new_ms1/MediaSvr/C102015F1FD646DC1534F8244C4C4A6A2AF0C425.dat",
		"/home/guoqiang/code/NewMediaServer/new_ms1/MediaSvr/14C39AB98D3205AC196308FAA27B3485BCB7109C.dat",
		"/home/guoqiang/code/NewMediaServer/xiongm/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
		"/home/guoqiang/code/NewMediaServer/xiongm/MediaSvr/C102015F1FD646DC1534F8244C4C4A6A2AF0C425.dat",
		"/home/guoqiang/code/NewMediaServer/xiongm/MediaSvr/14C39AB98D3205AC196308FAA27B3485BCB7109C.dat",
		"/home/html/02A805D023585316849992F61018FD8A360239D6.dat",
		"/home/html/C102015F1FD646DC1534F8244C4C4A6A2AF0C425.dat",
		"/home/html/B421F1690EDF4115D39A0531770FF1D46B57F9FC.dat",
	};
	const int file_count = (int)(sizeof(file_names)/sizeof(file_names[0]));
	int k;

	for(k = 0; k < file_count; ++k)
		read_sequence(file_names[k]);

	return 0;
}
Example #8
0
extern int read_motifs (
  AjPSeqall fdata,			/* opened dataset file */
  char *filename,			/* motif file */
  MOTIF motif[NMOTIFS],			/* motif info */
  BOOLEAN save_dataset,			/* return dataset in memory */
  DATASET *dataset			/* integer-encoded dataset */
)
{
  int i, nmotifs;
  FILE *fptr;
  char seq_name[MSN+1];
  HASH_TABLE ht_seq_names;		/* hash of dataset seq names */
  char *sample_name;			/* name of sample from dataset */
  char *id;				/* id of sample from dataset */
  char *sequence;			/* sample from dataset */
  int length;				/* length of sample from dataset */
  int seq_no = 0;			/* number of sequences in dataset */

  /* create a hash table of sequence names */
  ht_seq_names = hash_create(DATA_HASH_SIZE);

  /* hash the names in the dataset 
     so that motif files can be checked for bad ones; column is set to 0 
  */
  while (read_sequence(fdata, &sample_name, &id, &sequence, &length)) {
    /* skip sequence if there was an error */
    if (length < 0) continue;

    if (hash_lookup(sample_name, 0, ht_seq_names)) {
      /* printf("Duplicate sequence: %s\n", sample_name); */
      myfree(sample_name);
      myfree(id);
      myfree(sequence);
      continue; 
    }
    hash_insert(sample_name, 0, ht_seq_names);	/* put name in hash table */
    if (save_dataset) {
      /* create a sample and put the sample in the array of samples */
      if ((seq_no % RCHUNK) == 0) {
	Resize(dataset->samples,
	  (seq_no + RCHUNK) * (int) sizeof(SAMPLE *), SAMPLE *);
      }
      dataset->samples[seq_no] = (SAMPLE *) malloc(sizeof(SAMPLE));
      dataset->samples[seq_no]->sample_name = sample_name;
      dataset->samples[seq_no]->length = length;
      /* encode the sequence as integers */
      for (i=0; i < length; ++i) {
	int c = (int) sequence[i];
	int e = hash(c);
	if (e == -1) {
	  printf("\nIllegal character %c in sequence %s.  ", c, sample_name);
	  printf("Change alphabet or fix data file.\n");
	  return 0;
	}
        sequence[i] = e;
      }
      dataset->samples[seq_no]->res = sequence;
      dataset->n_samples = ++seq_no;
      /*printf("\r%s", sample_name);*/
    } else {
Example #9
0
/*
 * Program entry point: runs parse_args() and detect_columns(), then reads
 * a sequence and hands it to render_sequence() together with the
 * opt_columns / opt_rows settings.  Always returns 0.
 */
int main(int argc, char **argv) {
    parse_args(argc, argv);
    detect_columns();

    struct sequence *loaded = read_sequence();
    render_sequence(loaded, opt_columns, opt_rows);

    return 0;
}
Example #10
0
/*
    Serially de-replicates the contents of a FASTA file: every sequence
    read from the file is passed to dereplicate_db() in file order.

    Inputs:
        fasta_fp: path of the FASTA file to read
        db: pointer to the de-replication database to compare against

    A failure to open the file is reported through error_handler() with
    FATAL_ERROR.
*/
void _serial_dereplication(char* fasta_fp, derep_db* db){
    FILE* fd = fopen(fasta_fp, "r");
    if(fd == NULL)
        error_handler(FATAL_ERROR, "Error opening file %s", fasta_fp);

    // Feed the database one sequence at a time until the reader runs dry
    sequence* seq;
    for(seq = read_sequence(fd); seq != NULL; seq = read_sequence(fd))
        dereplicate_db(db, seq);

    fclose(fd);
}
Example #11
0
File: main.c Project: zydeon/mosal
/*
 * Program entry point.
 *
 * Expects exactly two arguments, the files holding the two sequences.
 * The sequences are loaded into the globals seq1/seq2, K is set to
 * N + M - 2 * lcs_length(), and align() is run between table setup and
 * teardown.
 *
 * Returns 0 on success and 1 when the argument count is wrong.
 */
int main( int argc, char *argv[]) {

	if(argc != 3){
		/* a usage error is a failure: report it on stderr with a non-zero status */
		fprintf(stderr, "Usage: %s <seq1_file> <seq2_file>\n", argv[0]);
		return 1;
	}
	
	read_sequence(seq1, argv[1]);
	read_sequence(seq2, argv[2]);

	M = strlen(seq1); N = strlen(seq2);
	K = N+M-2*lcs_length();

	init_dynamic_tables();
	align();
	remove_dynamic_tables();

	return 0;
}
/*
 * Program entry point.
 *
 * argv[1] and argv[2] are handed to read_sequence() to obtain the two
 * sequences; argv[3] names the score-matrix file.  alignment() is always
 * called with the shorter sequence first (ties pass B first).
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when arguments are missing or an
 * input cannot be obtained.
 */
int main(int argc, char **argv) {
  if (argc < 4) {
    fprintf(stderr, "Usage: %s <sequence_a> <sequence_b> <score_matrix_file>\n",
            argv[0]);
    return EXIT_FAILURE;
  }

  char *A = read_sequence(argv[1]);
  char *B = read_sequence(argv[2]);
  /* NOTE(review): assumes read_sequence() signals failure with NULL -- confirm */
  if (A == NULL || B == NULL) {
    fprintf(stderr, "Unable to read the input sequences\n");
    free(A);
    free(B);
    return EXIT_FAILURE;
  }

  FILE *sm_fp = fopen(argv[3], "r");
  if (sm_fp == NULL) {
    perror(argv[3]);
    free(A);
    free(B);
    return EXIT_FAILURE;
  }

  ScoreMatrix *sm = scoreMatrixCreate(sm_fp);

  int M = strlen(A);
  int N = strlen(B);

  if (N > M) {
    alignment(A, B, M, N, sm);
  } else {
    alignment(B, A, N, M, sm);
  }

  /* closed only after the alignment in case the matrix still refers to it */
  fclose(sm_fp);
  free(A);
  free(B);
  
  return EXIT_SUCCESS;
}
Example #13
0
// Read the sequence of the record called `name` from the (multi-)FASTA
// file `filename`.
//
// The file is scanned line by line for a header that is exactly
// ">" + name; the sequence itself is then read by read_sequence() from the
// stream position right after that header.
//
// Throws std::domain_error when the file cannot be opened and
// std::invalid_argument when no matching header is found before the input
// ends or a read fails.
SeqType
read_fasta(const std::string& filename, const std::string& name)
{
  const std::string begin_contig = ">" + name;
  std::string line;
  SeqType sequence;

  // open file
  std::ifstream fastafile ( filename.c_str() , std::ifstream::in );

  if (!fastafile.is_open()) {
    std::stringstream ss;

    ss<< "Fasta file "<< filename << " cannot be opened.";

    throw std::domain_error(ss.str().c_str());
  }

  // find contig in multifasta
  do {
    // Stop on a failed read as well as on end of file: testing eof() alone
    // would spin forever on a stream that fails without reaching the end.
    if (!std::getline(fastafile, line) || fastafile.eof()) {
      std::stringstream s;
      s << "Sequence of \"" << name << "\" not found.";
      throw std::invalid_argument(s.str());
    }

  } while (line!=begin_contig);

  sequence=read_sequence(fastafile);

  fastafile.close();
  return sequence;
}
Example #14
0
/*
    Reads sequence number `idx` from the file pointed to by fd.

    Sequences are consumed with read_sequence() until CURR_SEQ has moved
    past `idx` or no sequence is left.

    Returns the last sequence read, or NULL when CURR_SEQ is already past
    `idx` on entry (the sequence was read earlier) or the file holds no
    more sequences.
*/
sequence* read_sequence_by_idx(FILE *fd, int idx){
    sequence* current = NULL;

    // Nothing to do when the requested sequence is already behind us
    if(CURR_SEQ <= idx){
        // Keep reading; the last one read is the one requested
        for(;;){
            current = read_sequence(fd);
            if(CURR_SEQ > idx || current == NULL)
                break;
        }
    }

    return current;
}
/*
 * selinux_status_deny_unknown
 *
 * Reports the guideline for handling undefined object classes or
 * permissions: 0 means policy queries on undefined items are allowed,
 * 1 means such queries are denied.
 *
 * Returns -1 (with errno set to EINVAL) when the status page has not been
 * opened.  In fallback mode the answer comes from
 * security_deny_unknown().
 */
int selinux_status_deny_unknown(void)
{
	uint32_t	seq_before;
	uint32_t	deny;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (selinux_status == MAP_FAILED)
		return security_deny_unknown();

	/* re-read until the sequence number is stable across the access */
	for (;;) {
		seq_before = read_sequence(selinux_status);

		deny = selinux_status->deny_unknown;

		if (seq_before == read_sequence(selinux_status))
			break;
	}

	return deny != 0;
}
Example #16
0
File: main.c Project: zydeon/mosal
/*
 * Program entry point.
 *
 * Expects exactly three arguments: two sequence files and the number of
 * mid bounds.  The sequences are loaded into the globals seq1/seq2, the
 * bounds and tables are initialised, and align() followed by traceback()
 * is run before the tables are released.
 *
 * Returns 0 on success and 1 when the argument count is wrong.
 */
int main( int argc, char *argv[]) {

	srand( time(NULL) );	/* random seed to choose one score between equal score vectors */

	if(argc != 4){
		/* a usage error is a failure: report it on stderr with a non-zero status */
		fprintf(stderr, "Usage: %s <seq1_file> <seq2_file> <mid_bounds_number>\n", argv[0]);
		return 1;
	}

	read_sequence(seq1, argv[1]);
	read_sequence(seq2, argv[2]);

	M = strlen(seq1);		N = strlen(seq2);

	init_bounds( atoi(argv[3]) );
	init_tables();
	align();
	traceback();

	remove_tables();
	remove_bounds_tables();

	return 0;
}
Example #17
0
/*
 * Program entry point.
 *
 * Asks for the number of values, allocates an int array of that size,
 * fills it through read_sequence(), and prints the count returned by
 * three_sum_brutal_force().
 *
 * Returns 0 on success, 1 when the count cannot be read, is negative, or
 * the array cannot be allocated.
 */
int main (){
	
	int *vec;
	int num;
	int finalQnt = 0;

	printf("Quanto numeros gostaria de inserir?\n");
	/* without this check `num` would be used uninitialised on bad input */
	if (scanf("%d",&num) != 1 || num < 0) {
		fprintf(stderr, "Entrada invalida.\n");
		return 1;
	}
	printf("Quantidade de numeros inseridos : %d\n", num);
	vec = malloc((size_t)num * sizeof *vec);
	/* malloc(0) may legitimately return NULL, so only a non-empty request can fail */
	if (vec == NULL && num > 0) {
		fprintf(stderr, "Falha ao alocar memoria.\n");
		return 1;
	}
	
	read_sequence(vec,num);
	finalQnt = three_sum_brutal_force(vec,num);
	printf("Quantidade encontrada : %d\n", finalQnt);
	free (vec);
	return 0;
}
Example #18
0
/*
 * Program entry point.
 *
 * interface() processes the command line and supplies the input/output
 * streams and the option flags.  The input is advanced past its first
 * '>' and then, until end of file, a name and a sequence are read, the
 * sequence is complemented when comp == 1 and reversed when reve == 1,
 * and the record is handed to output().
 *
 * Returns 0, or 1 when a read error occurs before the first '>'.
 */
int main(int argc, char *argv[]){
  FILE *fin,*fout;
  char *sequence=NULL, *seqname=NULL;
  int comp,reve,verbose, width;
  long unsigned length;
  int c;

  interface(argc, argv, &verbose, &fin, &fout, &reve, &comp, &width);

  /* Skip everything up to and including the first '>'.  EOF has to be
     tested as well: fgetc() keeps returning EOF on exhausted input, so a
     file without any '>' would otherwise loop forever. */
  do {
    c = fgetc(fin);
  } while (c != EOF && c != '>');

  if (c == EOF)
    return ferror(fin) ? 1 : 0;   /* no record in the input */

  while(!feof(fin)){
    seqname=read_seqname(fin, seqname);
    sequence=read_sequence(fin, sequence);
    
    length=strlen(sequence);
    if (comp==1) make_complement_sequence_of(sequence, length);
    if (reve==1)    make_reverse_sequence_of(sequence, length);

    output(fout,seqname,sequence, verbose, width);
  }
  
  return 0;
}
  // Fill `buff` with sequence characters taken from the FASTA stream in
  // `st`. If a seam was saved by a previous call, its mer_len_ - 1
  // characters are copied to the front of the buffer first. An 'N' is
  // written between consecutive reads. On exit the last mer_len_ - 1
  // characters of the buffer are saved as the new seam, provided the
  // buffer holds at least that many.
  void read_fasta(stream_status& st, sequence_ptr& buff) {
    size_t filled = 0;
    if(st.have_seam) {
      memcpy(buff.start, st.seam, mer_len_ - 1);
      filled = mer_len_ - 1;
    }

    // The stream is assumed to always be positioned on sequence data
    // (or at EOF), never on a header line.
    while(st.stream->good() && filled < buf_size_ - mer_len_ - 1) {
      const size_t got = read_sequence(*st.stream, filled, buff.start, '>');
      filled += got;

      if(st.stream->peek() != '>')
        continue;

      buff.start[filled++] = 'N'; // separator between reads
      ignore_line(*st.stream);    // skip to next sequence (headers, quals, ...)
      ++reads_read_;
    }
    buff.end = buff.start + filled;

    st.have_seam = filled >= (size_t)(mer_len_ - 1);
    if(st.have_seam)
      memcpy(st.seam, buff.end - mer_len_ + 1, mer_len_ - 1);
  }
/*
 * selinux_status_updated
 *
 * Reports whether anything has happened since the previous call.
 * `selinux_status->sequence' is incremented on both setenforce and
 * policy-reload events, so a value that differs from the one remembered
 * by the previous call means that something has happened.
 *
 * Returns 1 when an update was detected, 0 when not, and -1 on error
 * (errno is set to EINVAL when the status page has not been opened).
 */
int selinux_status_updated(void)
{
	uint32_t	now_seqno;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (selinux_status != MAP_FAILED) {
		now_seqno = read_sequence(selinux_status);
	} else {
		/* fallback mode: the sequence is maintained from netlink events */
		if (avc_netlink_check_nb() < 0)
			return -1;

		now_seqno = fallback_sequence;
	}

	/*
	 * The current sequence number is always even, so it can never equal
	 * a `last_seqno' that still holds its odd initial value.  The first
	 * call therefore just records the current number and never reports
	 * an update, because this function is about changes since the last
	 * invocation.
	 */
	if (last_seqno & 0x0001)
		last_seqno = now_seqno;

	if (last_seqno == now_seqno)
		return 0;

	last_seqno = now_seqno;
	return 1;
}
Example #21
0
/**
 * @brief HTTP::get_line
 *
 * Reads up to the "\r\n" end-of-line sequence through read_sequence(),
 * using a fresh, empty scratch buffer for the call.
 *
 * @return string with the retrieved line
 */
std::string HTTP::get_line()
{
    std::vector<char> scratch;
    return read_sequence(scratch, "\r\n");
}
DATASET *read_seq_file(
  char *file_name,		/* name of file to open */
  char *alpha,			/* alphabet used in sequences */
  BOOLEAN use_comp,		/* use complementary strands, too */
  double seqfrac 		/* fraction of input sequences to use */
)
{
  int i, j;
  FILE *data_file;		/* file with samples to read */
  FILE *prior_file=NULL;	/* file with positional priors to read */
  char *sample_name;		/* name of sample read */
  char *sample_de;		/* descriptor text for sample */
  char *sequence;		/* sequence read */
  long length;			/* length of sequence */
  BOOLEAN error=FALSE;		/* none yet */
  SAMPLE *sample;		/* sample created */
  DATASET *dataset;		/* dataset created */
  int n_samples=0;		/* number of samples read */
  double *seq_weights=NULL;	/* sequence weights */
  int n_wgts=0;			/* number of sequence weights given */

  /* create a hash table of sequence names */
  if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE);

  /* create a dataset */
  dataset = (DATASET *) mymalloc(sizeof(DATASET));
  dataset->alength = strlen(alpha);
  dataset->alphabet = alpha;
  dataset->psp_w = 0;			// indicates no PSP was read
  dataset->log_psp_w = 0;		// so log_psp will get initialized

  /* open data file */
  if (file_name == NULL) {
    fprintf(stderr, "You must specify a data file or `stdin'.\n");
    exit(1);
  } else if (strcmp(file_name, "stdin")) {
    data_file = fopen(file_name, "r"); 
    if (data_file == NULL) {
      fprintf(stderr, "Cannot open file `%s'.\n", file_name);
      exit(1);
    }
  } else {
    data_file = stdin;
  }

  /* initialize maximum length of sequences */
  dataset->max_slength = 0;
  dataset->min_slength = 10000000;

  dataset->n_samples = 0;	/* no samples yet */
  dataset->samples = NULL;	/* no samples */

  while (read_sequence(data_file, &sample_name, &sample_de, &sequence, 
    &length)) {

    /* skip sequence if an error occurred */
    if (length < 0) continue;

    /* parse weights if given; make (more than enough) room in array */
    if (strcmp(sample_name, "WEIGHTS")==0) {
      double wgt; 
      char *wgt_str = sample_de;
      Resize(seq_weights, n_wgts+(int)strlen(wgt_str), double);
      while (sscanf(wgt_str, "%lf", &wgt) == 1) {
        if (wgt <= 0 || wgt > 1) {
	  fprintf(stderr, 
            "Weights must be larger than zero and no greater than 1.\n");
	  exit(1);
        }
        seq_weights[n_wgts++] = wgt;			/* save weight */
        wgt_str += strspn(wgt_str, "      ");		/* skip white */
        wgt_str += strcspn(wgt_str, "     ");		/* skip token */
      }
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }

    /* ignore duplicate (same sample name) sequences */ 
    if (hash_lookup_str(sample_name, ht_seq_names) != NULL) {
      fprintf(stderr, "Skipping sequence '%s'.\n", sample_name);
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }
    hash_insert_str(sample_name, ht_seq_names);  /* put name in hash table */

    n_samples++;

    /* see if sequence will be used in random sample; store it if yes */
    if (drand48() >= 1 - seqfrac) {

      HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample

      /* create the sample */
      sample = create_sample(alpha, length, sample_name, sequence, sample_de, use_comp);
      if (sample == NULL) {error = TRUE; continue;}

      /* record maximum length of actual sequences */
      dataset->max_slength = MAX(sample->length, dataset->max_slength);
      dataset->min_slength = MIN(sample->length, dataset->min_slength);

      /* put the sample in the array of samples */
      if ((dataset->n_samples % RCHUNK) == 0) {
        Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *);
      }
Example #23
0
/*
 * Train a model from `train_filename`, tune it on `tune_filename`, and
 * write the resulting weight vector to `model_filename`.
 *
 * Steps, as performed below:
 *   1. Scan the training file once and record the byte offset of every
 *      sentence.
 *   2. Repeatedly visit the sentences in shuffled order, calling
 *      train_sequence() on each, then evaluate the averaged weights on the
 *      tuning file with tag().  Training stops once the tuning error
 *      exceeds 99% of its running average.
 *   3. Fold the weight vector in half (adding the upper half onto the
 *      lower half) for as long as the tuning error stays within 1% of the
 *      best error seen.
 *   4. Write the final weights to the model file.
 *
 * Returns 0 on success and 1 on failure.  Failure to open any of the three
 * files terminates the process with exit(1).
 */
static int train(
        const char *train_filename,
        const char *tune_filename,
        const char *model_filename)
{
    FILE *train_file = fopen(train_filename, "rb");
    FILE *tune_file = fopen(tune_filename, "rb");

    if (train_file == NULL) {
        perror("unable to open training data file");
        exit(1);
    }

    if (tune_file == NULL) {
        perror("unable to open tuning data file");
        exit(1);
    }

    size_t i;

    const size_t max_items = 0x400;
    const size_t max_fields = max_items*N_TRAIN_FIELDS;
    const size_t max_len = 0x10000;

    uint8_t *field_buf[max_fields];
    size_t field_len[max_fields];
    size_t n_items;
    uint8_t buf[max_len];

    /* Everything that the cleanup code at `out' touches is declared here,
     * before the first goto. */
    size_t weights_len = 0x1000000;
    const size_t max_sents = 0x100000;
    size_t n_sents;
    size_t iter;
    size_t compression;
    double t = 0.0;
    double tune_error = 1.0;
    double tune_error_avg = 1.0;
    double best_error = 1.0;
    int status = 1;

    real *weights = malloc(sizeof(real)*weights_len);
    // Even elements contain averages of the weights vector,
    // odd elements contain the time (i.e. value of t) of the last update of
    // the average.
    // This is less elegant, but good for cache locality since these values
    // are accessed randomly at the same time.
    // TODO: might also want to get the weight vector itself into the same
    // area.
    double *average_weights = malloc(sizeof(double)*weights_len*2);
    // File offsets of sentence starts
    long *sent_offsets = malloc(sizeof(long)*max_sents);
    // Visiting order of the sentences; kept on the heap because it can
    // hold up to max_sents entries, too many for the stack.
    size_t *sent_order = NULL;

    if (weights == NULL || average_weights == NULL || sent_offsets == NULL) {
        fprintf(stderr, "Out of memory while allocating training buffers\n");
        goto out;
    }

    for (i=0; i<weights_len; i++) weights[i] = (real)0.0;
    for (i=0; i<weights_len*2; i++) average_weights[i] = 0.0;

    // First, get file offsets of sentence starts
    for (n_sents=0; ; n_sents++) {
        if (n_sents >= max_sents) {
            fprintf(stderr, "Too many sentences in %s (limit is %zu)\n",
                    train_filename, max_sents);
            goto out;
        }
        sent_offsets[n_sents] = ftell(train_file);
        n_items = max_items;
        size_t buf_len = max_len;
        const int rv = read_sequence(
                train_file, field_buf, field_len, N_TRAIN_FIELDS, &n_items,
                buf, &buf_len);
        if (rv < 0) {
            if (!feof(train_file)) {
                fprintf(stderr, "Error at %s:%ld (bytes)!\n",
                        train_filename, ftell(train_file));
                goto out;
            }
            rewind(train_file);
            break;
        }
    }

    if (n_sents == 0) {
        fprintf(stderr, "Training data contains no sentences\n");
        goto out;
    }

    {
        // Give back the unused tail; if shrinking fails the original,
        // larger buffer is still valid and simply kept.
        long *shrunk = realloc(sent_offsets, sizeof(long)*n_sents);
        if (shrunk != NULL) sent_offsets = shrunk;
    }

    sent_order = malloc(sizeof(size_t)*n_sents);
    if (sent_order == NULL) {
        fprintf(stderr, "Out of memory while allocating sentence order\n");
        goto out;
    }
    for (i=0; i<n_sents; i++) sent_order[i] = i;

    fprintf(stderr, "Training data contains %zu sentences\n", n_sents);

    for (iter=0; ; iter++) {
        shuffle(sent_order, n_sents);

        fprintf(stderr, "Iteration %zu...\n", iter+1);
        size_t n_errs = 0;
        size_t n_total = 0;
        size_t sent;
        for (sent=0; sent<n_sents; sent++) {
            fseek(train_file, sent_offsets[sent_order[sent]], SEEK_SET);

            n_items = max_items;
            size_t buf_len = max_len;
            const int rv = read_sequence(
                    train_file, field_buf, field_len, N_TRAIN_FIELDS, &n_items,
                    buf, &buf_len);
            if (rv < 0) {
                fprintf(stderr, "Error at %s:%ld (bytes)!\n",
                        train_filename, ftell(train_file));
                goto out;
            }

            // A variable-length array must have at least one element
            label gold[n_items > 0 ? n_items : 1];
            for (i=0; i<n_items; i++) {
                const int tag = tagset_from_str(
                        (const char*)field_buf[i*N_TRAIN_FIELDS + COL_TAG]);
                if (tag < 0) {
                    fprintf(stderr, "Invalid tag: '%s'\n",
                            (const char*)field_buf[i*N_TRAIN_FIELDS + COL_TAG]);
                }
                gold[i] = tag;
            }

            n_total += n_items;
            n_errs += train_sequence(
                    (const uint8_t**)field_buf, field_len, N_TRAIN_FIELDS,
                    n_items, weights, weights_len, gold, t, average_weights);
            t += 1.0;
        }

        fprintf(stderr, "  Training error: %.2f%%\n",
                100.0*(double)n_errs/(double)n_total);

        // Bring every average up to the current time step
        for (i=0; i<weights_len; i++) {
            average_weights[i*2] +=
                (t - average_weights[i*2+1]) * (double)weights[i];
            average_weights[i*2+1] = t;
        }

        tune_error = 1.0;

        real *real_average_weights = malloc(sizeof(real)*weights_len);
        if (real_average_weights == NULL) {
            fprintf(stderr, "Out of memory while averaging weights\n");
            goto out;
        }
        for (i=0; i<weights_len; i++)
            real_average_weights[i] = (real)average_weights[i*2];
        tag(tune_file, real_average_weights, weights_len, NULL, 
            N_TRAIN_FIELDS, &tune_error);
        free(real_average_weights);
        rewind(tune_file);

        fprintf(stderr, "  Tuning error:   %.2f%%\n", 100.0*tune_error);
        if (tune_error < best_error) best_error = tune_error;
        if (iter == 0) {
            tune_error_avg = tune_error;
        } else {
            if (tune_error > 0.99*tune_error_avg) break;
            tune_error_avg = tune_error_avg*0.5 + tune_error*0.5;
        }
    }

    fclose(train_file);
    train_file = NULL;

    for (i=0; i<weights_len; i++)
        weights[i] = (real)average_weights[i*2];

    free(average_weights);
    average_weights = NULL;

    fprintf(stderr, "Finding optimal feature compression...\n");

    if (tune_error > best_error) best_error = tune_error;

    // Stop folding once a single weight is left so the length never hits 0
    for (compression=2; weights_len >= 2; compression*=2) {
        const size_t half_len = weights_len/2;
        real *folded_weights = malloc(sizeof(real)*half_len);
        if (folded_weights == NULL) {
            fprintf(stderr, "Out of memory while compressing features\n");
            goto out;
        }
        for (i=0; i<half_len; i++)
            folded_weights[i] = weights[i] + weights[half_len + i];
        double folded_error = 1.0;
        tag(tune_file, folded_weights, half_len, NULL, N_TRAIN_FIELDS,
            &folded_error);
        fprintf(stderr, "  %zux compression tuning error: %.2f%%\n",
                compression, 100.0*folded_error);
        if (folded_error > 1.01 * best_error) {
            // Too lossy: keep the previous, unfolded weights
            free(folded_weights);
            break;
        }
        free(weights);
        weights = folded_weights;
        weights_len = half_len;
        rewind(tune_file);
        if (folded_error < best_error) best_error = folded_error;
    }
    // `compression' is one step ahead of the folding actually applied
    fprintf(stderr, "Selected %zux compression: 0x%zx features\n",
            compression/2, weights_len);

    fclose(tune_file);
    tune_file = NULL;

    {
        FILE *model = fopen(model_filename, "wb");
        if (model == NULL) {
            perror("unable to open model file for writing");
            exit(1);
        }
        const size_t n_written =
            fwrite(weights, sizeof(real), weights_len, model);
        const int close_rv = fclose(model);
        if (n_written != weights_len || close_rv != 0) {
            fprintf(stderr, "unable to write model file %s\n",
                    model_filename);
            goto out;
        }
    }

    status = 0;

out:
    if (train_file != NULL) fclose(train_file);
    if (tune_file != NULL) fclose(tune_file);
    free(sent_order);
    free(sent_offsets);
    free(average_weights);
    free(weights);

    return status;
}
Example #24
0
extern DATASET *read_seq_file(
  AjPSeqall seqa,		/* name of file to open */
  char *alpha,			/* alphabet used in sequences */
  BOOLEAN use_comp,		/* use complementary strands, too */
  double seqfrac		/* fraction of input sequences to use */
)
{
  int i, j, pcol;
  /*FILE *data_file;*/		/* file with samples to read */
  char *sample_name;		/* name of sample read */
  char *sample_de;		/* descriptor text for sample */
  char *sequence;		/* sequence read */
  int length;			/* length of sequence */
  BOOLEAN error=FALSE;		/* none yet */
  SAMPLE *sample;		/* sample created */
  DATASET *dataset;		/* dataset created */
  int n_samples=0;		/* number of samples read */
  double *seq_weights=NULL;	/* sequence weights */
  int n_wgts=0;			/* number of sequence weights given */

  /* create a hash table of sequence names */
  if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE);

  /* create a dataset */
  dataset = (DATASET *) malloc(sizeof(DATASET));
  dataset->alength = strlen(alpha);
  dataset->alphabet = alpha;
  dataset->pal = 0;		/* not looking for palindromes */


  /* initialize maximum length of sequences */
  dataset->max_slength = 0;
  dataset->min_slength = 10000000;

  dataset->n_samples = 0;	/* no samples yet */
  dataset->samples = NULL;	/* no samples */

  while (read_sequence(seqa, &sample_name, &sample_de, &sequence, 
    &length)) {

    /* skip sequence if an error occurred */
    if (length < 0) continue;

    /* parse weights if given; make (more than enough) room in array */
    if (strcmp(sample_name, "WEIGHTS")==0) {
      double wgt; 
      char *wgt_str = sample_de;
      Resize(seq_weights, n_wgts+strlen(wgt_str), double);
      while (sscanf(wgt_str, "%lf", &wgt) == 1) {
        if (wgt <= 0 || wgt > 1) {
	  fprintf(stderr, 
            "Weights must be larger than zero and no greater than 1.\n");
	  exit(1);
        }
        seq_weights[n_wgts++] = wgt;			/* save weight */
        wgt_str += strspn(wgt_str, "      ");		/* skip white */
        wgt_str += strcspn(wgt_str, "     ");		/* skip token */
      }
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }

    /* ignore duplicate (same sample name) sequences */ 
    if (hash_lookup(sample_name, 0, ht_seq_names)) {
      fprintf(stderr, "Skipping %s\n", sample_name);
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }
    hash_insert(sample_name, 0, ht_seq_names);  /* put name in hash table */

    n_samples++;

    /* see if sequence will be used in random sample; store it if yes */
    if (drand48() >= 1 - seqfrac) {

      /* create the sample */
      sample = create_sample(alpha, length, sample_name, sequence);
      if (sample == NULL) {error = TRUE; continue;}

      /* record maximum length of actual sequences */
      dataset->max_slength = MAX(sample->length, dataset->max_slength);
      dataset->min_slength = MIN(sample->length, dataset->min_slength);

      /* put the sample in the array of samples */
      if ((dataset->n_samples % RCHUNK) == 0) {
        Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *);
      }
Example #25
0
/*
 * Program entry point.
 *
 * Reads a prior DHMM from argv[1] and an integer observation sequence
 * from argv[2], runs Penalized_Baum_Welch() starting from a posterior set
 * up by initialize_DHMM(), and writes the posterior to argv[3].
 *
 * Returns 0 on success, 1 when arguments are missing or a file cannot be
 * opened.
 */
int main (int argc, char **argv, char **env)
{
  /* Set-up */
  int I, M, T;                /* states, observations, sequence length */
  int *X;                     /* integer array */

  double **alpha, **beta, **gamma, ***xi;

  DHMM posterior, prior;
  FILE *PRIOR, *DATA, *POSTERIOR;

  unsigned long seed;            /* pseudorandom generator seed */
  int status = 0;                /* process exit status */

  (void) env;                    /* unused */

  /* Initialize */
  if (argc < 4) {
    fprintf(stderr, "Usage: %s <prior_file> <data_file> <posterior_file>\n",
            argv[0]);
    return 1;
  }

  set_seed(&seed, 0);

  /* read in the prior */
  PRIOR = fopen(argv[1], "r");
  if (PRIOR == NULL) {
    perror(argv[1]);
    return 1;
  }
  read_DHMM(PRIOR, &prior);
  fclose(PRIOR);  

  /* read in the data */
  DATA = fopen(argv[2], "r");
  if (DATA == NULL) {
    perror(argv[2]);
    free_DHMM(&prior);
    return 1;
  }
  read_sequence(DATA, &T, &X);
  fclose(DATA);

  /* allocate working space */
  I = prior.I;
  M = prior.M;

  new_DHMM(&posterior, I, M);

  alpha = array_double_2d(1, I, 1, T);
  beta  = array_double_2d(1, I, 1, T);
  gamma = array_double_2d(1, I, 1, T);
  xi    = array_double_3d(1, I, 1, I, 1, T);

  /* Process */

  /* we must initialize the posterior mode or the algorithm */
  /* will not work correctly */
  initialize_DHMM(&seed, &posterior);

  /* run the penalized maximum likelihood algorithm and write results */
  Penalized_Baum_Welch(&posterior, &prior, T, X, alpha, beta, gamma, xi);

  /* write out the posterior; opened only now so that an existing file */
  /* is not truncated before there is a result to store */
  POSTERIOR = fopen(argv[3], "w");
  if (POSTERIOR == NULL) {
    perror(argv[3]);
    status = 1;
  } else {
    write_DHMM(POSTERIOR, &posterior);  
    fclose(POSTERIOR);
  }

  /* Clean-up */
  save_seed(&seed);

  free_array_double_2d(alpha, 1, I, 1, T);
  free_array_double_2d( beta, 1, I, 1, T);
  free_array_double_2d(gamma, 1, I, 1, T);
  free_array_double_3d(   xi, 1, I, 1, I, 1, T);
  free_array_int_1d(    X, 1, T);
  free_DHMM(&posterior);
  free_DHMM(&prior);

  return status;
}
Example #26
0
/*
 * Program entry point.
 *
 * Reads a prior DHMM from argv[1] and an integer observation sequence
 * from argv[2], derives the "star" model with Calculate_star(), runs the
 * scaled forward and backward passes, accumulates gamma, and writes
 * gamma[i][t] to argv[3] as one line per time step t with one column per
 * state i.
 *
 * Returns 0 on success, 1 when arguments are missing or a file cannot be
 * opened.
 */
int main (int argc, char **argv, char **env)
{
  /* Start-up */
  int I, M, T;     /* states, observation types, sequence length */
  int *X;          /* integer array */
  int i, t;        /* indices */

  /* no xi array here: nothing in this program ever reads one */
  double *scale, **alpha, **beta, **gamma;

  DHMM prior, star;
  FILE *DATA, *PRIOR, *SMOOTH;
  int status = 0;  /* process exit status */

  (void) env;      /* unused */

  /* Initialize */
  if (argc < 4) {
    fprintf(stderr, "Usage: %s <prior_file> <data_file> <smooth_file>\n",
            argv[0]);
    return 1;
  }
  
  /* read in the prior */
  PRIOR = fopen(argv[1], "r");
  if (PRIOR == NULL) {
    perror(argv[1]);
    return 1;
  }
  read_DHMM(PRIOR, &prior);
  fclose(PRIOR);  

  /* read in the data */
  DATA = fopen(argv[2], "r");
  if (DATA == NULL) {
    perror(argv[2]);
    free_DHMM(&prior);
    return 1;
  }
  read_sequence(DATA, &T, &X);
  fclose(DATA);

  /* allocate working space */
  I = prior.I;
  M = prior.M;

  alpha = array_double_2d(1, I, 1, T);
  beta  = array_double_2d(1, I, 1, T);
  gamma = array_double_2d(1, I, 1, T);

  new_DHMM(&star, I, M);                   /* initialize the star matrix */
  scale = array_double_1d(1, T);           /* initialize scaling array */

  /* Process */
  
  /* We use the fact that the normalized gamma and xi arrays
   * contain respectively the posterior probabilities of
   * states and observations.
   */

  /* M-step */
  Calculate_star(&prior, &star);

  /* E-step */
  /* We do not want to rescale the gamma array here */
  Forward_Scaled(&star, T, X, alpha, scale);
  Backward_Scaled(&star, T, X, beta, scale);

  Accumulate_gamma(&star, T, X, alpha, beta, gamma);

  /* To do:  1.  Put the above and the below into separate utility
   *             routines.
   */

  /* write results; opened only now so that an existing file is not */
  /* truncated before there is a result to store */
  SMOOTH = fopen(argv[3], "w");
  if (SMOOTH == NULL) {
    perror(argv[3]);
    status = 1;
  } else {
    for (t=1; t<=T; t++) {
      for (i=1; i<=I; i++) {
        fprintf(SMOOTH, "%12.3f ", gamma[i][t]);
      }
      fprintf(SMOOTH, "\n");
    }

    fclose(SMOOTH);
  }

  /* Clean-up */

  free_array_double_1d(scale, 1, T);
  free_array_double_2d(alpha, 1, I, 1, T);
  free_array_double_2d( beta, 1, I, 1, T);
  free_array_double_2d(gamma, 1, I, 1, T);
  free_array_int_1d(    X, 1, T);
  free_DHMM(&prior);
  free_DHMM(&star);

  return status;
}