int main( int argc, char *argv[]) { if(argc != 5){ printf("Usage: %s <seq1_file> <seq2_file> <subs_file> <mid_bounds_number>\n", argv[0]); return 0; } read_sequence(seq1, argv[1]); read_sequence(seq2, argv[2]); M = strlen(seq1); N = strlen(seq2); init_subs_table( argv[3] ); init_bounds( atoi(argv[4]) ); init_dynamic_tables(); align(); //print scores int i, line; line = M % 2; // determine from which line to start printing for( i = 0; i < Q[line][N].num ; ++i ) printf("%d %d\n", Q[line][N].scores[i].matches, Q[line][N].scores[i].gaps); remove_dynamic_tables(); remove_bounds_tables(); return 0; }
/*
 * selinux_status_policyload
 *
 * Reports how many times the policy has been reloaded on the running system.
 * On the fallback path the number is not trustworthy until the first event
 * message has arrived over the netlink socket, so the intended use is to
 * compare it against a previously returned value to notice a reload.
 *
 * Returns -1 when selinux_status is NULL (errno set to EINVAL) or when
 * avc_netlink_check_nb() fails on the fallback path.
 */
int selinux_status_policyload(void)
{
	uint32_t before;
	uint32_t reloads;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	/* MAP_FAILED sentinel: serve the value kept by the netlink fallback. */
	if (selinux_status == MAP_FAILED) {
		if (avc_netlink_check_nb() < 0)
			return -1;
		return fallback_policyload;
	}

	/* Retry until the sequence number is the same before and after the read. */
	for (;;) {
		before = read_sequence(selinux_status);
		reloads = selinux_status->policyload;
		if (before == read_sequence(selinux_status))
			break;
	}

	return reloads;
}
/*
 * selinux_status_getenforce
 *
 * Reports the mode SELinux is currently running in:
 * 1 for enforcing mode, 0 for permissive mode.
 *
 * Returns -1 when selinux_status is NULL (errno set to EINVAL) or when
 * avc_netlink_check_nb() fails on the fallback path.
 */
int selinux_status_getenforce(void)
{
	uint32_t before;
	uint32_t mode;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	/* MAP_FAILED sentinel: serve the value kept by the netlink fallback. */
	if (selinux_status == MAP_FAILED) {
		if (avc_netlink_check_nb() < 0)
			return -1;
		return fallback_enforcing;
	}

	/* Retry until the sequence number is the same before and after the read. */
	for (;;) {
		before = read_sequence(selinux_status);
		mode = selinux_status->enforcing;
		if (before == read_sequence(selinux_status))
			break;
	}

	if (mode)
		return 1;
	return 0;
}
/*
 * cncEnvIn - reads both input sequences into a single data item, derives the
 * tile grid from the requested tile size, records the start time and
 * prescribes the initial steps.
 *
 *   argv[1], argv[2]  sequence file names
 *   argv[3], argv[4]  tile width and tile height (parsed with atoi)
 *   context           passed through to every put/prescribe call
 *
 * NOTE(review): the guards below rely on CNC_REQUIRE not returning when its
 * condition is false - confirm against the macro definition.
 */
void cncEnvIn(int argc, char **argv, Context *context) {
    CNC_REQUIRE(argc == 5, "Usage: %s fileName1 fileName2 tileWidth tileHeight\n", argv[0]);

    // Open sequence input files
    FILE *file1 = open_file(argv[1]);
    FILE *file2 = open_file(argv[2]);
    size_t filesize1 = file_length(file1);
    size_t filesize2 = file_length(file2);

    // Allocate tile data item and read sequence data.
    // The item holds both sequences back to back, each with one extra byte.
    SeqData *data;
    size_t dataSize = sizeof(SeqData) + filesize1 + filesize2 + 2;
    cncHandle_t dataHandle = cncCreateItemSized_data(&data, dataSize);
    data->seq2offset = filesize1 + 1;
    size_t length1 = read_sequence(file1, 1, SEQ1(data), filesize1);
    size_t length2 = read_sequence(file2, 2, SEQ2(data), filesize2);

    // Tile width and height
    int tw = atoi(argv[3]);
    int th = atoi(argv[4]);
    PRINTF("Tile width:  %d\n", tw);
    PRINTF("Tile height: %d\n", th);

    // atoi() yields 0 for non-numeric input; a zero size would divide by zero
    // below and a negative one would slip through the unsigned comparison.
    CNC_REQUIRE(tw > 0 && th > 0,
                "Tile width and height must be positive integers.\n");
    CNC_REQUIRE((size_t)tw <= length1 && (size_t)th <= length2,
                "Tile size too large for given input.\n");

    // Number of tiles wide and high (any partial trailing tile is dropped)
    int ntw = length1 / tw;
    int nth = length2 / th;
    PRINTF("Imported %d x %d tiles.\n", ntw, nth);

    // Initialize tile dimension data and put
    data->tw = tw;
    data->th = th;
    data->ntw = ntw;
    data->nth = nth;
    memcpy(data->score_matrix, ALIGNMENT_SCORES, sizeof(ALIGNMENT_SCORES));
    cncPut_data(dataHandle, 0, context);

    // Record starting time
    struct timeval *startTime;
    cncHandle_t startTime_handle = cncCreateItem_startTime(&startTime, 1);
    gettimeofday(startTime, 0);
    cncPut_startTime(startTime_handle, 0, context);

    // Seed edges
    cncPrescribe_initAboveStep(tw, ntw, context);
    cncPrescribe_initLeftStep(th, nth, context);

    // One swStep per tile, row by row
    int i, j;
    for (i = 0; i < nth; i++) {
        for (j = 0; j < ntw; j++) {
            cncPrescribe_swStep(i, j, context);
        }
    }

    cncPrescribe_cncEnvOut(ntw, nth, tw, context);
}
/*
 * Reads one X/Y pair from f into input.
 *
 * X is read up to END_SEQUENCE; when that read reports a non-zero code the
 * function stops and returns 0. Otherwise Y is read up to END_PAIR and 1 is
 * returned. The result of the second read is not inspected.
 */
int read_line (FILE* f, INPUT* input, int max_sequence)
{
  const int first_status =
      read_sequence(f, &input->X, &input->X_length, max_sequence, END_SEQUENCE);

  if (first_status != 0) {
    return 0;
  }

  read_sequence(f, &input->Y, &input->Y_length, max_sequence, END_PAIR);
  return 1;
}
/*
 * Fills buff with sequence data taken from st.stream, treating '+' as the
 * marker that ends a record's sequence part.
 *
 * If a seam was saved by a previous call, its mer_len_ - 1 bytes are copied
 * to the front of the buffer first. After each complete record the qualities
 * are skipped, an 'N' separator is appended, the next header line is skipped
 * and reads_read_ is incremented. On exit buff.end marks the end of the data
 * and the last mer_len_ - 1 bytes are saved as the new seam when enough data
 * is present.
 */
void read_fastq(stream_status& st, sequence_ptr& buff) {
  const size_t seam_len = mer_len_ - 1;
  size_t filled = 0;

  if(st.have_seam) {
    memcpy(buff.start, st.seam, seam_len);
    filled = seam_len;
  }

  // Here, the st.stream is assumed to always point to some
  // sequence (or EOF). Never at header.
  while(st.stream->good() && filled < buf_size_ - mer_len_ - 1) {
    size_t got = read_sequence(*st.stream, filled, buff.start, '+');
    filled     += got;
    st.seq_len += got;

    // Sequence part not finished yet: keep reading.
    if(st.stream->peek() != '+')
      continue;

    skip_quals(*st.stream, st.seq_len);
    if(st.stream->good()) {
      *(buff.start + filled) = 'N'; // Add N between reads
      ++filled;
      ignore_line(*st.stream);      // Skip sequence header
      ++reads_read_;
    }
    st.seq_len = 0;
  }

  buff.end = buff.start + filled;

  st.have_seam = filled >= seam_len;
  if(st.have_seam)
    memcpy(st.seam, buff.end - mer_len_ + 1, seam_len);
}
/*
 * Calls read_sequence() once for every path in a fixed list of .dat files,
 * in list order. Always returns 0; results of the calls are not checked.
 */
int test_case1()
{
    char* file_names[] = {
        "/home/guoqiang/code/0.0.2/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
        "/home/guoqiang/code/NewMediaServer/new_ms/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
        "/home/guoqiang/code/NewMediaServer/new_ms/MediaSvr/14C39AB98D3205AC196308FAA27B3485BCB7109C.dat",
        "/home/guoqiang/code/NewMediaServer/new_ms1/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
        "/home/guoqiang/code/NewMediaServer/new_ms1/MediaSvr/C102015F1FD646DC1534F8244C4C4A6A2AF0C425.dat",
        "/home/guoqiang/code/NewMediaServer/new_ms1/MediaSvr/14C39AB98D3205AC196308FAA27B3485BCB7109C.dat",
        "/home/guoqiang/code/NewMediaServer/xiongm/MediaSvr/02A805D023585316849992F61018FD8A360239D6.dat",
        "/home/guoqiang/code/NewMediaServer/xiongm/MediaSvr/C102015F1FD646DC1534F8244C4C4A6A2AF0C425.dat",
        "/home/guoqiang/code/NewMediaServer/xiongm/MediaSvr/14C39AB98D3205AC196308FAA27B3485BCB7109C.dat",
        "/home/html/02A805D023585316849992F61018FD8A360239D6.dat",
        "/home/html/C102015F1FD646DC1534F8244C4C4A6A2AF0C425.dat",
        "/home/html/B421F1690EDF4115D39A0531770FF1D46B57F9FC.dat",
    };
    const int total = (int)(sizeof(file_names) / sizeof(file_names[0]));
    int pos = 0;

    while (pos < total) {
        read_sequence(file_names[pos]);
        pos++;
    }

    return 0;
}
extern int read_motifs ( AjPSeqall fdata, /* opened dataset file */ char *filename, /* motif file */ MOTIF motif[NMOTIFS], /* motif info */ BOOLEAN save_dataset, /* return dataset in memory */ DATASET *dataset /* integer-encoded dataset */ ) { int i, nmotifs; FILE *fptr; char seq_name[MSN+1]; HASH_TABLE ht_seq_names; /* hash of dataset seq names */ char *sample_name; /* name of sample from dataset */ char *id; /* id of sample from dataset */ char *sequence; /* sample from dataset */ int length; /* length of sample from dataset */ int seq_no = 0; /* number of sequences in dataset */ /* create a hash table of sequence names */ ht_seq_names = hash_create(DATA_HASH_SIZE); /* hash the names in the dataset so that motif files can be checked for bad ones; column is set to 0 */ while (read_sequence(fdata, &sample_name, &id, &sequence, &length)) { /* skip sequence if there was an error */ if (length < 0) continue; if (hash_lookup(sample_name, 0, ht_seq_names)) { /* printf("Duplicate sequence: %s\n", sample_name); */ myfree(sample_name); myfree(id); myfree(sequence); continue; } hash_insert(sample_name, 0, ht_seq_names); /* put name in hash table */ if (save_dataset) { /* create a sample and put the sample in the array of samples */ if ((seq_no % RCHUNK) == 0) { Resize(dataset->samples, (seq_no + RCHUNK) * (int) sizeof(SAMPLE *), SAMPLE *); } dataset->samples[seq_no] = (SAMPLE *) malloc(sizeof(SAMPLE)); dataset->samples[seq_no]->sample_name = sample_name; dataset->samples[seq_no]->length = length; /* encode the sequence as integers */ for (i=0; i < length; ++i) { int c = (int) sequence[i]; int e = hash(c); if (e == -1) { printf("\nIllegal character %c in sequence %s. ", c, sample_name); printf("Change alphabet or fix data file.\n"); return 0; } sequence[i] = e; } dataset->samples[seq_no]->res = sequence; dataset->n_samples = ++seq_no; /*printf("\r%s", sample_name);*/ } else {
/*
 * Entry point: parses the arguments, detects the column count, reads one
 * sequence and renders it using opt_columns and opt_rows. Always returns 0.
 */
int main(int argc, char **argv)
{
    struct sequence *loaded;

    parse_args(argc, argv);
    detect_columns();

    loaded = read_sequence();
    render_sequence(loaded, opt_columns, opt_rows);

    return 0;
}
/*
    Runs every sequence of a FASTA file through the de-replication database.

    Inputs:
        fasta_fp: path of the FASTA file to read
        db: pointer to the de-replication database

    A file that cannot be opened is reported through error_handler() with
    FATAL_ERROR.
*/
void _serial_dereplication(char* fasta_fp, derep_db* db){
    FILE* fasta = fopen(fasta_fp, "r");
    sequence* current;

    if(!fasta)
        error_handler(FATAL_ERROR, "Error opening file %s", fasta_fp);

    // read_sequence() returning NULL ends the walk over the file
    for(current = read_sequence(fasta); current != NULL; current = read_sequence(fasta)){
        dereplicate_db(db, current);
    }

    fclose(fasta);
}
/*
 * Entry point: loads the two sequences named on the command line, derives
 * K = N + M - 2 * lcs_length() from their lengths and runs the alignment.
 *
 * Returns 0 in every case, including when only the usage text is printed.
 */
int main( int argc, char *argv[])
{
    if (argc != 3) {
        printf("Usage: %s <seq1_file> <seq2_file>\n", argv[0]);
        return 0;
    }

    read_sequence(seq1, argv[1]);
    read_sequence(seq2, argv[2]);

    M = strlen(seq1);
    N = strlen(seq2);
    K = N+M-2*lcs_length();

    init_dynamic_tables();
    align();
    remove_dynamic_tables();

    return 0;
}
/*
 * Entry point: aligns the sequences stored in argv[1] and argv[2] using the
 * score matrix read from argv[3].
 *
 * alignment() is always called with the shorter sequence first (the second
 * one first when both have equal length).
 *
 * Returns EXIT_SUCCESS after the alignment ran, EXIT_FAILURE when an argument
 * is missing or an input could not be loaded.
 */
int main(int argc, char **argv)
{
    if (argc < 4) {
        fprintf(stderr, "Usage: %s <seq1_file> <seq2_file> <score_matrix_file>\n",
                argc > 0 ? argv[0] : "align");
        return EXIT_FAILURE;
    }

    char *A = read_sequence(argv[1]);
    char *B = read_sequence(argv[2]);
    if (A == NULL || B == NULL) {
        fprintf(stderr, "Unable to read sequence file %s\n",
                A == NULL ? argv[1] : argv[2]);
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    FILE *sm_fp = fopen(argv[3], "r");
    if (sm_fp == NULL) {
        perror(argv[3]);
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    int status = EXIT_SUCCESS;
    ScoreMatrix *sm = scoreMatrixCreate(sm_fp);
    if (sm == NULL) {
        fprintf(stderr, "Unable to load score matrix from %s\n", argv[3]);
        status = EXIT_FAILURE;
    } else {
        int M = strlen(A);
        int N = strlen(B);

        if (N > M) {
            alignment(A, B, M, N, sm);
        } else {
            alignment(B, A, N, M, sm);
        }
    }

    /* Closed only after the alignment in case the matrix still refers to it. */
    fclose(sm_fp);
    free(A);
    free(B);
    return status;
}
/**
 * Extracts one named sequence from a (multi-)FASTA file.
 *
 * The file is scanned line by line for a header that is exactly ">" + name;
 * the stream is then handed to read_sequence(), whose result is returned.
 *
 * @param filename path of the FASTA file
 * @param name     sequence name, without the leading '>'
 * @return the value produced by read_sequence() for the matching record
 * @throws std::domain_error     if the file cannot be opened
 * @throws std::invalid_argument if the stream ends or fails before the header
 *                               is found
 */
SeqType read_fasta(const std::string& filename, const std::string& name)
{
    std::string line, begin_contig=">"+name;
    SeqType sequence;

    // open file
    std::ifstream fastafile ( filename.c_str() , std::ifstream::in );
    if (!fastafile.is_open())
    {
        std::stringstream ss;
        ss<< "Fasta file "<< filename << " cannot be opened.";
        throw std::domain_error(ss.str().c_str());
    }

    // find contig in multifasta
    do
    {
        std::getline(fastafile, line);
        // good() is false on EOF and on any read failure; testing only eof()
        // would spin forever on a stream that fails without reaching the end.
        if (!fastafile.good())
        {
            std::stringstream s;
            s << "Sequence of \"" << name << "\" not found.";
            throw std::invalid_argument(s.str());
        }
    }
    while (line!=begin_contig);

    sequence=read_sequence(fastafile);

    fastafile.close();

    return sequence;
}
/* Reads the sequence number `idx` present in the file pointed by fd Returns a pointer to the read sequence structure or NULL if no more sequences are present on the file or the sequence has been already read */ sequence* read_sequence_by_idx(FILE *fd, int idx){ // Check that we don't have already read the sequence if(CURR_SEQ > idx) return NULL; // Skip sequences until the one we have to read sequence* seq; do{ seq = read_sequence(fd); } while(CURR_SEQ <= idx && seq != NULL); // Return the sequence to read return seq; }
/*
 * selinux_status_deny_unknown
 *
 * Reports how queries on undefined object classes or permissions are
 * handled: 0 means such queries are treated as allowed, 1 means they are
 * denied.
 *
 * Returns -1 with errno set to EINVAL when selinux_status is NULL. When
 * selinux_status is MAP_FAILED the answer comes from security_deny_unknown().
 */
int selinux_status_deny_unknown(void)
{
	uint32_t before;
	uint32_t flag;

	if (selinux_status == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (selinux_status == MAP_FAILED)
		return security_deny_unknown();

	/* Retry until the sequence number is the same before and after the read. */
	for (;;) {
		before = read_sequence(selinux_status);
		flag = selinux_status->deny_unknown;
		if (before == read_sequence(selinux_status))
			break;
	}

	if (flag)
		return 1;
	return 0;
}
/*
 * Entry point: loads the two sequences named on the command line, sets up
 * the bounds (argv[3], converted with atoi) and tables, then runs the
 * alignment followed by the traceback.
 *
 * The random generator is seeded from the clock before the arguments are
 * examined; it is used to choose one score between equal score vectors.
 *
 * Returns 0 in every case, including when only the usage text is printed.
 */
int main( int argc, char *argv[])
{
    srand( time(NULL) );

    if (argc != 4) {
        printf("Usage: %s <seq1_file> <seq2_file> <mid_bounds_number>\n", argv[0]);
        return 0;
    }

    read_sequence(seq1, argv[1]);
    read_sequence(seq2, argv[2]);

    M = strlen(seq1);
    N = strlen(seq2);

    init_bounds( atoi(argv[3]) );
    init_tables();

    align();
    traceback();

    remove_tables();
    remove_bounds_tables();

    return 0;
}
/*
 * Entry point: asks how many numbers will be entered, reads them with
 * read_sequence() and prints the count reported by three_sum_brutal_force().
 *
 * Returns 0 on success, 1 when the requested amount is not a non-negative
 * integer or the buffer cannot be allocated.
 */
int main (void)
{
    int *vec;
    int num = 0;
    int finalQnt = 0;

    printf("Quanto numeros gostaria de inserir?\n");
    /* Without this check a failed read would leave num undefined and a
     * negative value would be turned into a huge allocation size. */
    if (scanf("%d", &num) != 1 || num < 0) {
        fprintf(stderr, "Entrada invalida: informe um numero inteiro nao negativo.\n");
        return 1;
    }
    printf("Quantidade de numeros inseridos : %d\n", num);

    /* Allocate at least one element so a zero count never yields NULL. */
    vec = malloc((size_t)(num > 0 ? num : 1) * sizeof *vec);
    if (vec == NULL) {
        perror("malloc");
        return 1;
    }

    read_sequence(vec, num);
    finalQnt = three_sum_brutal_force(vec, num);
    printf("Quantidade encontrada : %d\n", finalQnt);

    free(vec);
    return 0;
}
/*
 * Entry point: reads records from the input stream and writes each one back
 * out, optionally complemented and/or reversed.
 *
 * interface() supplies the streams and the verbose/reverse/complement/width
 * options. Input is skipped up to the first '>' before records are read.
 *
 * Returns 0 on success, 1 when the input contains no '>' at all.
 */
int main(int argc, char *argv[]){
  FILE *fin,*fout;
  char *sequence=NULL, *seqname=NULL;
  int comp,reve,verbose, width;
  long unsigned length;
  int c;

  interface(argc, argv, &verbose, &fin, &fout, &reve, &comp, &width);

  /* Skip to the first header. fgetc() keeps returning EOF once the stream is
   * exhausted, so EOF must end the scan or it would never terminate. */
  do {
    c = fgetc(fin);
  } while (c != EOF && c != '>');
  if (c == EOF) {
    fprintf(stderr, "%s: no '>' header found in input\n", argv[0]);
    return 1;
  }

  while(!feof(fin)){
    seqname=read_seqname(fin, seqname);
    sequence=read_sequence(fin, sequence);
    if (sequence == NULL)   /* strlen(NULL) would be undefined behaviour */
      break;
    length=strlen(sequence);
    if (comp==1) make_complement_sequence_of(sequence, length);
    if (reve==1) make_reverse_sequence_of(sequence, length);
    output(fout,seqname,sequence, verbose, width);
  }

  return 0;
}
/*
 * Fills buff with sequence data taken from st.stream, treating '>' as the
 * marker that starts the next record.
 *
 * If a seam was saved by a previous call, its mer_len_ - 1 bytes are copied
 * to the front of the buffer first. Whenever the stream is positioned on a
 * '>' an 'N' separator is appended, the line is skipped and reads_read_ is
 * incremented. On exit buff.end marks the end of the data and the last
 * mer_len_ - 1 bytes are saved as the new seam when enough data is present.
 */
void read_fasta(stream_status& st, sequence_ptr& buff) {
  const size_t seam_len = mer_len_ - 1;
  size_t filled = 0;

  if(st.have_seam) {
    memcpy(buff.start, st.seam, seam_len);
    filled = seam_len;
  }

  // Here, the current stream is assumed to always point to some
  // sequence (or EOF). Never at header.
  while(st.stream->good() && filled < buf_size_ - mer_len_ - 1) {
    filled += read_sequence(*st.stream, filled, buff.start, '>');

    // Still inside the same record: keep reading.
    if(st.stream->peek() != '>')
      continue;

    *(buff.start + filled) = 'N'; // Add N between reads
    ++filled;
    ignore_line(*st.stream);      // Skip to next sequence (skip headers, quals, ...)
    ++reads_read_;
  }

  buff.end = buff.start + filled;

  st.have_seam = filled >= seam_len;
  if(st.have_seam)
    memcpy(st.seam, buff.end - mer_len_ + 1, seam_len);
}
/* * selinux_status_updated * * It returns whether something has been happened since the last call. * Because `selinux_status->sequence' shall be always incremented on * both of setenforce/policyreload events, so differences from the last * value informs us something has been happened. */ int selinux_status_updated(void) { uint32_t curr_seqno; int result = 0; if (selinux_status == NULL) { errno = EINVAL; return -1; } if (selinux_status == MAP_FAILED) { if (avc_netlink_check_nb() < 0) return -1; curr_seqno = fallback_sequence; } else { curr_seqno = read_sequence(selinux_status); } /* * `curr_seqno' is always even-number, so it does not match with * `last_seqno' being initialized to odd-number in the first call. * We never return 'something was updated' in the first call, * because this function focuses on status-updating since the last * invocation. */ if (last_seqno & 0x0001) last_seqno = curr_seqno; if (last_seqno != curr_seqno) { last_seqno = curr_seqno; result = 1; } return result; }
/**
 * @brief HTTP::get_line
 *
 * Delegates to read_sequence() with a fresh buffer and the "\r\n" end-of-line
 * sequence as the terminator.
 *
 * @return string with the retrieved line
 */
std::string HTTP::get_line()
{
    std::vector<char> scratch;
    return read_sequence(scratch, "\r\n"); // reads until endline sequence
}
DATASET *read_seq_file( char *file_name, /* name of file to open */ char *alpha, /* alphabet used in sequences */ BOOLEAN use_comp, /* use complementary strands, too */ double seqfrac /* fraction of input sequences to use */ ) { int i, j; FILE *data_file; /* file with samples to read */ FILE *prior_file=NULL; /* file with positional priors to read */ char *sample_name; /* name of sample read */ char *sample_de; /* descriptor text for sample */ char *sequence; /* sequence read */ long length; /* length of sequence */ BOOLEAN error=FALSE; /* none yet */ SAMPLE *sample; /* sample created */ DATASET *dataset; /* dataset created */ int n_samples=0; /* number of samples read */ double *seq_weights=NULL; /* sequence weights */ int n_wgts=0; /* number of sequence weights given */ /* create a hash table of sequence names */ if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE); /* create a dataset */ dataset = (DATASET *) mymalloc(sizeof(DATASET)); dataset->alength = strlen(alpha); dataset->alphabet = alpha; dataset->psp_w = 0; // indicates no PSP was read dataset->log_psp_w = 0; // so log_psp will get initialized /* open data file */ if (file_name == NULL) { fprintf(stderr, "You must specify a data file or `stdin'.\n"); exit(1); } else if (strcmp(file_name, "stdin")) { data_file = fopen(file_name, "r"); if (data_file == NULL) { fprintf(stderr, "Cannot open file `%s'.\n", file_name); exit(1); } } else { data_file = stdin; } /* initialize maximum length of sequences */ dataset->max_slength = 0; dataset->min_slength = 10000000; dataset->n_samples = 0; /* no samples yet */ dataset->samples = NULL; /* no samples */ while (read_sequence(data_file, &sample_name, &sample_de, &sequence, &length)) { /* skip sequence if an error occurred */ if (length < 0) continue; /* parse weights if given; make (more than enough) room in array */ if (strcmp(sample_name, "WEIGHTS")==0) { double wgt; char *wgt_str = sample_de; Resize(seq_weights, n_wgts+(int)strlen(wgt_str), double); while 
(sscanf(wgt_str, "%lf", &wgt) == 1) { if (wgt <= 0 || wgt > 1) { fprintf(stderr, "Weights must be larger than zero and no greater than 1.\n"); exit(1); } seq_weights[n_wgts++] = wgt; /* save weight */ wgt_str += strspn(wgt_str, " "); /* skip white */ wgt_str += strcspn(wgt_str, " "); /* skip token */ } myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } /* ignore duplicate (same sample name) sequences */ if (hash_lookup_str(sample_name, ht_seq_names) != NULL) { fprintf(stderr, "Skipping sequence '%s'.\n", sample_name); myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } hash_insert_str(sample_name, ht_seq_names); /* put name in hash table */ n_samples++; /* see if sequence will be used in random sample; store it if yes */ if (drand48() >= 1 - seqfrac) { HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample /* create the sample */ sample = create_sample(alpha, length, sample_name, sequence, sample_de, use_comp); if (sample == NULL) {error = TRUE; continue;} /* record maximum length of actual sequences */ dataset->max_slength = MAX(sample->length, dataset->max_slength); dataset->min_slength = MIN(sample->length, dataset->min_slength); /* put the sample in the array of samples */ if ((dataset->n_samples % RCHUNK) == 0) { Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *); }
static int train( const char *train_filename, const char *tune_filename, const char *model_filename) { FILE *train_file = fopen(train_filename, "rb"); FILE *tune_file = fopen(tune_filename, "rb"); if (train_file == NULL) { perror("unable to open training data file"); exit(1); } if (tune_file == NULL) { perror("unable to open tuning data file"); exit(1); } size_t i; const size_t max_items = 0x400; const size_t max_fields = max_items*N_TRAIN_FIELDS; const size_t max_len = 0x10000; uint8_t *field_buf[max_fields]; size_t field_len[max_fields]; size_t n_items; uint8_t buf[max_len]; size_t weights_len = 0x1000000; real *weights = malloc(sizeof(real)*weights_len); // Even elements contain averages of the weights vector, // odd elements contain the time (i.e. value of t) of the last update of // the average. // This is less elegant, but good for cache locality since these values // are accessed randomly at the same time. // TODO: might also want to get the weight vector itself into the same // area. 
double *average_weights = malloc(sizeof(double)*weights_len*2); double t = 0.0; for (i=0; i<weights_len; i++) weights[i] = (real)0.0; for (i=0; i<weights_len*2; i++) average_weights[i] = 0.0; size_t iter; double tune_error_avg = 1.0; double best_error = 1.0; // First, get file offsets of sentence starts size_t max_sents = 0x100000; size_t n_sents; long *sent_offsets = malloc(sizeof(long)*max_sents); for (n_sents=0; ; n_sents++) { if (n_sents >= max_sents) { return 1; } sent_offsets[n_sents] = ftell(train_file); n_items = max_items; size_t buf_len = max_len; const int rv = read_sequence( train_file, field_buf, field_len, N_TRAIN_FIELDS, &n_items, buf, &buf_len); if (rv < 0) { if (!feof(train_file)) { fprintf(stderr, "Error at %s:%ld (bytes)!\n", train_filename, ftell(train_file)); return 1; } rewind(train_file); break; } } sent_offsets = realloc(sent_offsets, sizeof(long)*n_sents); size_t sent_order[n_sents]; for (i=0; i<n_sents; i++) sent_order[i] = i; fprintf(stderr, "Training data contains %zd sentences\n", n_sents); double tune_error = 1.0; for (iter=0; ; iter++) { shuffle(sent_order, n_sents); fprintf(stderr, "Iteration %zd...\n", iter+1); size_t n_errs = 0; size_t n_total = 0; size_t sent; for (sent=0; sent<n_sents; sent++) { fseek(train_file, sent_offsets[sent_order[sent]], SEEK_SET); n_items = max_items; size_t buf_len = max_len; const int rv = read_sequence( train_file, field_buf, field_len, N_TRAIN_FIELDS, &n_items, buf, &buf_len); if (rv < 0) { fprintf(stderr, "Error at %s:%ld (bytes)!\n", train_filename, ftell(train_file)); return 1; } label gold[n_items]; for (i=0; i<n_items; i++) { const int tag = tagset_from_str( (const char*)field_buf[i*N_TRAIN_FIELDS + COL_TAG]); if (tag < 0) { fprintf(stderr, "Invalid tag: '%s'\n", field_buf[i*N_TRAIN_FIELDS + COL_TAG]); } gold[i] = tag; } n_total += n_items; n_errs += train_sequence( (const uint8_t**)field_buf, field_len, N_TRAIN_FIELDS, n_items, weights, weights_len, gold, t, average_weights); t += 1.0; } 
fprintf(stderr, " Training error: %.2f%%\n", 100.0*(double)n_errs/(double)n_total); for (i=0; i<weights_len; i++) { average_weights[i*2] += (t - average_weights[i*2+1]) * (double)weights[i]; average_weights[i*2+1] = t; } tune_error = 1.0; real *real_average_weights = malloc(sizeof(real)*weights_len); for (i=0; i<weights_len; i++) real_average_weights[i] = (real)average_weights[i*2]; tag(tune_file, real_average_weights, weights_len, NULL, N_TRAIN_FIELDS, &tune_error); free(real_average_weights); rewind(tune_file); fprintf(stderr, " Tuning error: %.2f%%\n", 100.0*tune_error); if (tune_error < best_error) best_error = tune_error; if (iter == 0) { tune_error_avg = tune_error; } else { if (tune_error > 0.99*tune_error_avg) break; tune_error_avg = tune_error_avg*0.5 + tune_error*0.5; } } fclose(train_file); for (i=0; i<weights_len; i++) weights[i] = (real)average_weights[i*2]; free(average_weights); fprintf(stderr, "Finding optimal feature compression...\n"); if (tune_error > best_error) best_error = tune_error; size_t compression; for (compression=2; ; compression*=2) { real *folded_weights = malloc(sizeof(real)*weights_len/2); for (i=0; i<weights_len/2; i++) folded_weights[i] = weights[i] + weights[weights_len/2 + i]; double tune_error = 1.0; tag(tune_file, folded_weights, weights_len/2, NULL, N_TRAIN_FIELDS, &tune_error); fprintf(stderr, " %zdx compression tuning error: %.2f%%\n", compression, 100.0*tune_error); if (tune_error > 1.01 * best_error) { free(folded_weights); fprintf(stderr, "Selected %zdx compression: 0x%zx features\n", compression/2, weights_len); break; } free(weights); weights = folded_weights; weights_len /= 2; rewind(tune_file); if (tune_error < best_error) best_error = tune_error; } fclose(tune_file); FILE *model = fopen(model_filename, "wb"); if (model == NULL) { perror("unable to open model file for writing"); exit(1); } fwrite(weights, sizeof(real), weights_len, model); fclose(model); free(weights); return 0; }
extern DATASET *read_seq_file( AjPSeqall seqa, /* name of file to open */ char *alpha, /* alphabet used in sequences */ BOOLEAN use_comp, /* use complementary strands, too */ double seqfrac /* fraction of input sequences to use */ ) { int i, j, pcol; /*FILE *data_file;*/ /* file with samples to read */ char *sample_name; /* name of sample read */ char *sample_de; /* descriptor text for sample */ char *sequence; /* sequence read */ int length; /* length of sequence */ BOOLEAN error=FALSE; /* none yet */ SAMPLE *sample; /* sample created */ DATASET *dataset; /* dataset created */ int n_samples=0; /* number of samples read */ double *seq_weights=NULL; /* sequence weights */ int n_wgts=0; /* number of sequence weights given */ /* create a hash table of sequence names */ if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE); /* create a dataset */ dataset = (DATASET *) malloc(sizeof(DATASET)); dataset->alength = strlen(alpha); dataset->alphabet = alpha; dataset->pal = 0; /* not looking for palindromes */ /* initialize maximum length of sequences */ dataset->max_slength = 0; dataset->min_slength = 10000000; dataset->n_samples = 0; /* no samples yet */ dataset->samples = NULL; /* no samples */ while (read_sequence(seqa, &sample_name, &sample_de, &sequence, &length)) { /* skip sequence if an error occurred */ if (length < 0) continue; /* parse weights if given; make (more than enough) room in array */ if (strcmp(sample_name, "WEIGHTS")==0) { double wgt; char *wgt_str = sample_de; Resize(seq_weights, n_wgts+strlen(wgt_str), double); while (sscanf(wgt_str, "%lf", &wgt) == 1) { if (wgt <= 0 || wgt > 1) { fprintf(stderr, "Weights must be larger than zero and no greater than 1.\n"); exit(1); } seq_weights[n_wgts++] = wgt; /* save weight */ wgt_str += strspn(wgt_str, " "); /* skip white */ wgt_str += strcspn(wgt_str, " "); /* skip token */ } myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } /* ignore duplicate (same sample name) sequences */ if 
(hash_lookup(sample_name, 0, ht_seq_names)) { fprintf(stderr, "Skipping %s\n", sample_name); myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } hash_insert(sample_name, 0, ht_seq_names); /* put name in hash table */ n_samples++; /* see if sequence will be used in random sample; store it if yes */ if (drand48() >= 1 - seqfrac) { /* create the sample */ sample = create_sample(alpha, length, sample_name, sequence); if (sample == NULL) {error = TRUE; continue;} /* record maximum length of actual sequences */ dataset->max_slength = MAX(sample->length, dataset->max_slength); dataset->min_slength = MIN(sample->length, dataset->min_slength); /* put the sample in the array of samples */ if ((dataset->n_samples % RCHUNK) == 0) { Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *); }
void main (int argc, char **argv, char **env) { /* Set-up */ int I, M, T; /* states, observations, sequence length */ int *X; /* integer array */ double **alpha, **beta, **gamma, ***xi; DHMM posterior, prior; FILE *PRIOR, *DATA, *POSTERIOR; unsigned long seed; /* pseudorandom generator seed */ /* Initialize */ /* To do: 1. Put in argument-checking code here. */ set_seed(&seed, 0); /* read in the prior */ PRIOR = fopen(argv[1], "r"); read_DHMM(PRIOR, &prior); fclose(PRIOR); /* read in the data */ DATA = fopen(argv[2], "r"); read_sequence(DATA, &T, &X); fclose(DATA); /* allocate working space */ I = prior.I; M = prior.M; new_DHMM(&posterior, I, M); alpha = array_double_2d(1, I, 1, T); beta = array_double_2d(1, I, 1, T); gamma = array_double_2d(1, I, 1, T); xi = array_double_3d(1, I, 1, I, 1, T); /* Process */ /* we must initialize the posterior mode or the algorithm */ /* will not work correctly */ initialize_DHMM(&seed, &posterior); /* run the penalized maximum likelihood algorithm and write results */ Penalized_Baum_Welch(&posterior, &prior, T, X, alpha, beta, gamma, xi); /* write out the posterior */ POSTERIOR = fopen(argv[3], "w"); write_DHMM(POSTERIOR, &posterior); fclose(POSTERIOR); /* Clean-up */ save_seed(&seed); free_array_double_2d(alpha, 1, I, 1, T); free_array_double_2d( beta, 1, I, 1, T); free_array_double_2d(gamma, 1, I, 1, T); free_array_double_3d( xi, 1, I, 1, I, 1, T); free_array_int_1d( X, 1, T); free_DHMM(&posterior); free_DHMM(&prior); }
void main (int argc, char **argv, char **env) { /* Start-up */ int I, M, T; /* states, observation types, sequence length */ int *X; /* integer array */ int i, t; /* indices */ double *scale, **alpha, **beta, **gamma, ***xi; DHMM prior, star; FILE *DATA, *PRIOR, *SMOOTH; /* Initialize */ /* To do: 1. Put in argument-checking code here. */ /* read in the prior */ PRIOR = fopen(argv[1], "r"); read_DHMM(PRIOR, &prior); fclose(PRIOR); /* read in the data */ DATA = fopen(argv[2], "r"); read_sequence(DATA, &T, &X); fclose(DATA); /* Debug code */ /* fprintf(stdout, "T: %d and X[T]: %d\n", T, X[T]); */ /* allocate working space */ I = prior.I; M = prior.M; alpha = array_double_2d(1, I, 1, T); beta = array_double_2d(1, I, 1, T); gamma = array_double_2d(1, I, 1, T); xi = array_double_3d(1, I, 1, I, 1, T); new_DHMM(&star, I, M); /* initialize the star matrix */ scale = array_double_1d(1, T); /* initialize scaling array */ /* Process */ /* We use the fact that the normalized gamma and xi arrays * contain respectively the posterior probabilities of * states and observations. */ /* M-step */ Calculate_star(&prior, &star); /* E-step */ /* We do not want to rescale the xi and gamma arrays here */ Forward_Scaled(&star, T, X, alpha, scale); Backward_Scaled(&star, T, X, beta, scale); Accumulate_gamma(&star, T, X, alpha, beta, gamma); /* To do: 1. Put the above and the below into separate utility * routines. */ /* write results */ SMOOTH = fopen(argv[3], "w"); for (t=1; t<=T; t++) { for (i=1; i<=I; i++) { fprintf(SMOOTH, "%12.3f ", gamma[i][t]); } fprintf(SMOOTH, "\n"); } fclose(SMOOTH); /* Clean-up */ free_array_double_1d(scale, 1, T); free_array_double_2d(alpha, 1, I, 1, T); free_array_double_2d( beta, 1, I, 1, T); free_array_double_2d(gamma, 1, I, 1, T); free_array_double_3d( xi, 1, I, 1, I, 1, T); free_array_int_1d( X, 1, T); free_DHMM(&prior); free_DHMM(&star); }