/*
 * apply_index() - build a per-state, per-symbol transition index for fast
 * lookup during apply (composition/traversal of the transition table).
 *
 * inout          selects which tape to index: APPLY_INDEX_INPUT indexes on
 *                the ->in symbol of each transition, otherwise on ->out.
 * densitycutoff  states with fewer than this many outgoing transitions are
 *                normally NOT indexed (not worth the memory) -- unless we
 *                are in flags_only mode and the state carries flag
 *                diacritics, in which case it is indexed regardless.
 * mem_limit      soft cap on bytes spent on index structures; when the
 *                running total `cnt` would exceed it we stop allocating
 *                (via the memlimit / memlimitnoindex labels) but keep
 *                whatever was already built.
 * flags_only     if nonzero, only index flag-carrying states; a no-op when
 *                the net has no flags at all.
 *
 * On success the finished index (an array of per-state symbol tables,
 * NULL for unindexed states) is stored in h->index_in or h->index_out
 * depending on `inout`.
 *
 * NOTE(review): `cnt` is unsigned int while mem_limit is int, so the
 * comparisons `cnt > mem_limit` are signed/unsigned -- a negative
 * mem_limit would convert to a huge unsigned value and disable the cap.
 * Presumably callers always pass a non-negative limit; confirm.
 */
void apply_index(struct apply_handle *h, int inout, int densitycutoff, int mem_limit, int flags_only) {
    struct fsm_state *fsm;
    unsigned int cnt = 0;                 /* running byte count charged against mem_limit */
    int i, j, maxtrans, numtrans, laststate, sym;
    fsm = h->gstates;
    struct apply_state_index **indexptr, *iptr, *tempiptr;
    /* pre_index: bucket array indexed by transition count; each bucket is a
       linked list of states that have exactly that many outgoing transitions.
       Bucket slot 0..maxtrans itself holds the first state (state_no == -1
       means "empty"); overflow states go into heap-allocated list nodes. */
    struct pre_index {
        int state_no;
        struct pre_index *next;
    } *pre_index, *tp, *tpp;

    if (flags_only && !h->has_flags) {
        return;                           /* nothing to do: asked for flag states only, net has none */
    }

    /* Pass 1: scan the (state-sorted, -1-terminated) transition table to find
       maxtrans, the largest outgoing-transition count of any state.  A row
       with target == -1 marks a state with no (further) transitions and is
       not counted. */
    for (i=0, laststate = 0, maxtrans = 0, numtrans = 0; (fsm+i)->state_no != -1; i++) {
        if ((fsm+i)->state_no != laststate) {
            /* state boundary: fold the previous state's count into the max */
            maxtrans = numtrans > maxtrans ? numtrans : maxtrans;
            numtrans = 0;
        }
        if ((fsm+i)->target != -1) {
            numtrans++;
        }
        laststate = (fsm+i)->state_no;
    }

    pre_index = xxcalloc(maxtrans+1, sizeof(struct pre_index));
    for (i = 0; i <= maxtrans; i++) {
        (pre_index+i)->state_no = -1;     /* mark every bucket head empty */
    }

    /* Pass 2: we create an array of states, indexed by how many transitions
       they have, so that later we can traverse them in order, densest first,
       in case we only want to index up to some predefined maximum memory
       usage.
       NOTE(review): the final state's count is only flushed into a bucket at
       a state-number change, and the loop ends at the -1 sentinel without a
       final flush -- so the last state appears never to be entered into
       pre_index (and hence never indexed).  Looks like an off-by-one at the
       table tail; confirm against the table format before changing. */
    for (i = 0, laststate = 0, maxtrans = 0, numtrans = 0; (fsm+i)->state_no != -1; i++) {
        if ((fsm+i)->state_no != laststate) {
            if ((pre_index+numtrans)->state_no == -1) {
                /* bucket head free: store the state inline */
                (pre_index+numtrans)->state_no = laststate;
            } else {
                /* bucket occupied: push a heap node onto the bucket's list */
                tp = xxcalloc(1, sizeof(struct pre_index));
                tp->state_no = laststate;
                tp->next = (pre_index+numtrans)->next;
                (pre_index+numtrans)->next = tp;
            }
            maxtrans = numtrans > maxtrans ? numtrans : maxtrans;
            numtrans = 0;
        }
        if ((fsm+i)->target != -1) {
            numtrans++;
        }
        laststate = (fsm+i)->state_no;
    }

    /* Charge the top-level pointer array against the memory budget before
       allocating it; if even that does not fit, skip index building entirely
       (indexptr stays NULL and is stored as such at the end). */
    indexptr = NULL;
    cnt += round_up_to_power_of_two(h->last_net->statecount*sizeof(struct apply_state_index *));
    if (cnt > mem_limit) {
        cnt -= round_up_to_power_of_two(h->last_net->statecount*sizeof(struct apply_state_index *));
        goto memlimitnoindex;
    }
    indexptr = xxcalloc(h->last_net->statecount, sizeof(struct apply_state_index *));

    if (h->has_flags && flags_only) {
        /* Mark states that have flags (lazily builds h->flagstates bitset) */
        if (!(h->flagstates)) {
            apply_mark_flagstates(h);
        }
    }

    /* Allocate per-state symbol tables, densest states first, so the memory
       budget is spent where an index pays off most. */
    for (i = maxtrans; i >= 0; i--) {
        for (tp = pre_index+i; tp != NULL; tp = tp->next) {
            if (tp->state_no >= 0) {
                if (i < densitycutoff) {
                    /* below the density cutoff: only index this state if we
                       are in flags_only mode and it actually carries flags */
                    if (!(h->has_flags && flags_only && BITTEST(h->flagstates, tp->state_no))) {
                        continue;
                    }
                }
                /* charge this state's symbol table before allocating it */
                cnt += round_up_to_power_of_two(h->sigma_size*sizeof(struct apply_state_index));
                if (cnt > mem_limit) {
                    cnt -= round_up_to_power_of_two(h->sigma_size*sizeof(struct apply_state_index));
                    goto memlimit;        /* budget exhausted: fill what we have, then clean up */
                }
                *(indexptr + tp->state_no) = xxmalloc(h->sigma_size*sizeof(struct apply_state_index));
                /* We make the tail of all index linked lists point to the
                   index entry for EPSILON, so that EPSILON transitions are
                   automatically followed as well when traversing an index. */
                for (j = 0; j < h->sigma_size; j++) {
                    (*(indexptr + tp->state_no) + j)->fsmptr = -1;
                    if (j == EPSILON)
                        (*(indexptr + tp->state_no) + j)->next = NULL;
                    else
                        (*(indexptr + tp->state_no) + j)->next = (*(indexptr + tp->state_no)); /* all tails point to epsilon */
                }
            }
        }
    }

 memlimit:
    /* Fill the allocated symbol tables: walk every transition and record its
       row number i under the indexed symbol of its source state.  States
       whose table was never allocated (iptr == NULL) are skipped. */
    for (i=0; (fsm+i)->state_no != -1; i++) {
        iptr = *(indexptr + (fsm+i)->state_no);
        if (iptr == NULL || (fsm+i)->target == -1) {
            continue;
        }
        sym = inout == APPLY_INDEX_INPUT ? (fsm+i)->in : (fsm+i)->out;
        if (h->has_flags && (h->flag_lookup+sym)->type) {
            sym = EPSILON;                /* flag diacritics are traversed like epsilons */
        }
        if (sym == UNKNOWN) {             /* We make the index of UNKNOWN point to IDENTITY */
            sym = IDENTITY;               /* since these are really the same symbol         */
        }
        if ((iptr+sym)->fsmptr == -1) {
            (iptr+sym)->fsmptr = i;       /* first transition on this symbol: store inline */
        } else {
            /* further transitions on the same symbol: prepend a list node
               (also charged to cnt, though past this point no limit check) */
            cnt += round_up_to_power_of_two(sizeof(struct apply_state_index));
            tempiptr = xxcalloc(1, sizeof(struct apply_state_index));
            tempiptr->next = (iptr+sym)->next;
            tempiptr->fsmptr = i;
            (iptr+sym)->next = tempiptr;
        }
    }

    /* Free preindex (the heap-allocated overflow nodes, then the bucket array) */
 memlimitnoindex:
    for (i = maxtrans; i >= 0; i--) {
        for (tp = (pre_index+i)->next; tp != NULL; tp = tpp) {
            tpp = tp->next;
            xxfree(tp);
        }
    }
    xxfree(pre_index);

    /* Publish the finished index (may be NULL if mem_limit was too small) */
    if (inout == APPLY_INDEX_INPUT) {
        h->index_in = indexptr;
    } else {
        h->index_out = indexptr;
    }
}
int main(int argc, char *argv[]) { int ret = 0; if (argc < 2) { fprintf(stderr, "Usage: transalign_killer [--cldev=x.y] <input file>\n"); fprintf(stderr, " --cldev=x.y: x specifies the platform index, y the device index.\n"); return 1; } long seq_length; char *sequence = load_text(argv[argc - 1], &seq_length); if (!sequence) return 1; seq_length--; // Cut final 0 byte // FIXME: All the following code relies on seq_length being a multiple of BASE. long round_seq_length = round_up_to_power_of_two(seq_length, BASE_EXP); long res_length = 0; for (long len = round_seq_length / BASE; len; len /= BASE) res_length += len; // Use some random index to be searched for here unsigned letter_index = seq_length / 2; // Select an OpenCL device cl_device_id dev = select_device(argc - 1, argv); if (!dev) return 1; // Initialize the OpenCL st...ack cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(ctx, dev, 0, NULL); // Load the OpenCL kernesl char *prog_src = load_text("trans.cl", NULL); if (!prog_src) return 1; cl_program prog = clCreateProgramWithSource(ctx, 1, (const char **)&prog_src, NULL, NULL); free(prog_src); // Build them clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); cl_kernel k_iadd = clCreateKernel(prog, "k_iadd", NULL); // initial addition cl_kernel k_cadd = clCreateKernel(prog, "k_cadd", NULL); // consecutive addition assert(k_iadd); assert(k_cadd); // Create the result buffer unsigned *result = malloc(res_length * sizeof(unsigned)); cl_mem result_gpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE | HOST_PTR_POLICY, res_length * sizeof(unsigned), result, NULL); clock_start(); /*** START OF ROCKET SCIENCE LEVEL RUNTIME-TIME INTENSIVE STUFF ***/ // Bandwidth intensive stuff goes here // Copy the sequence to the video memory (or, generally speaking, the OpenCL device) cl_mem seq_gpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE | HOST_PTR_POLICY, seq_length * sizeof(char), sequence, NULL); long bw1_time = clock_delta(); 
// GPU intensive stuff goes here /** * First, transform every - and \0 into a 0 and every other character into a * 1. Then, add consecutive fields (BASE fields) together and store them at * the beginning of the result buffer. */ clSetKernelArg(k_iadd, 0, sizeof(result_gpu), &result_gpu); clSetKernelArg(k_iadd, 1, sizeof(seq_gpu), &seq_gpu); clSetKernelArg(k_iadd, 2, sizeof(unsigned), &(unsigned){seq_length});