// Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing), // or upon encountering an unexpected token, that token's type. // Explicit `return '?'` means a JSON parsing error, typically a member key // that is not a string. An unexpected token may be a valid token that was // not the type expected for a particular GA4GH field, or it may be '?' or // '\0' which should be propagated. static char parse_ga4gh_redirect_json(hFILE_multipart *fp, hFILE *json, kstring_t *b, kstring_t *header) { hts_json_token t; if (hts_json_fnext(json, &t, b) != '{') return t.type; while (hts_json_fnext(json, &t, b) != '}') { if (t.type != 's') return '?'; if (strcmp(t.str, "urls") == 0) { if (hts_json_fnext(json, &t, b) != '[') return t.type; while (hts_json_fnext(json, &t, b) != ']') { hfile_part *part; size_t n = 0, max = 0; hts_expand(hfile_part, fp->nparts+1, fp->maxparts, fp->parts); part = &fp->parts[fp->nparts++]; part->url = NULL; part->headers = NULL; if (t.type != '{') return t.type; while (hts_json_fnext(json, &t, b) != '}') { if (t.type != 's') return '?'; if (strcmp(t.str, "url") == 0) { if (hts_json_fnext(json, &t, b) != 's') return t.type; part->url = ks_release(b); } else if (strcmp(t.str, "headers") == 0) { if (hts_json_fnext(json, &t, b) != '{') return t.type; while (hts_json_fnext(json, &t, header) != '}') { if (t.type != 's') return '?'; if (hts_json_fnext(json, &t, b) != 's') return t.type; kputs(": ", header); kputs(t.str, header); n++; hts_expand(char *, n+1, max, part->headers); part->headers[n-1] = ks_release(header); part->headers[n] = NULL; } } else if (hts_json_fskip_value(json, '\0') != 'v') return '?'; } if (! part->url) return 'i'; } } else if (strcmp(t.str, "format") == 0) {
// Expands a output filename format string static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx) { kstring_t str = { 0, 0, NULL }; const char* pointer = format_string; const char* next; while ((next = strchr(pointer, '%')) != NULL) { kputsn(pointer, next-pointer, &str); ++next; switch (*next) { case '%': kputc('%', &str); break; case '*': kputs(basename, &str); break; case '#': kputl(rg_idx, &str); break; case '!': kputs(rg_id, &str); break; case '\0': // Error is: fprintf(pysamerr, "bad format string, trailing %%\n"); free(str.s); return NULL; default: // Error is: fprintf(pysamerr, "bad format string, unknown format specifier\n"); free(str.s); return NULL; } pointer = next + 1; } kputs(pointer, &str); return ks_release(&str); }
// Filters a header of @RG lines where ID != id_keep // TODO: strip @PG's descended from other RGs and their descendants static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep) { kstring_t str = {0, 0, NULL}; regex_t rg_finder; if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { return false; } // regex vars char* header = hdr->text; regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); kstring_t found_id = { 0, 0, NULL }; int error; while ((error = regexec(&rg_finder, header, 2, matches, 0)) == 0) { kputsn(header, matches[0].rm_so, &str); // copy header up until the found RG line found_id.l = 0; kputsn(header+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &found_id); // extract ID // if it matches keep keep it, else we can just ignore it if (strcmp(ks_str(&found_id), id_keep) == 0) { kputsn(header+matches[0].rm_so, (matches[0].rm_eo+1)-matches[0].rm_so, &str); } // move pointer forward header += matches[0].rm_eo+1; } // cleanup free(found_id.s); free(matches); regfree(&rg_finder); // Did we leave loop because of an error? if (error != REG_NOMATCH) { return false; } // Write remainder of string kputs(header, &str); // Modify header hdr->l_text = ks_len(&str); free(hdr->text); hdr->text = ks_release(&str); return true; }
// Expands a output filename format string static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format) { kstring_t str = { 0, 0, NULL }; const char* pointer = format_string; const char* next; while ((next = strchr(pointer, '%')) != NULL) { kputsn(pointer, next-pointer, &str); ++next; switch (*next) { case '%': kputc('%', &str); break; case '*': kputs(basename, &str); break; case '#': kputl(rg_idx, &str); break; case '!': kputs(rg_id, &str); break; case '.': // Only really need to cope with sam, bam, cram if (format->format != unknown_format) kputs(hts_format_file_extension(format), &str); else kputs("bam", &str); break; case '\0': // Error is: fprintf(pysam_stderr, "bad format string, trailing %%\n"); free(str.s); return NULL; default: // Error is: fprintf(pysam_stderr, "bad format string, unknown format specifier\n"); free(str.s); return NULL; } pointer = next + 1; } kputs(pointer, &str); return ks_release(&str); }
// Parses the command line into a newly allocated parsed_opts_t.
//
// On success returns true with *opts pointing at the parsed options; the
// struct is handed to the caller.  When only usage was requested (no
// arguments, or -h) returns true with *opts == NULL.  On any error a
// diagnostic and/or usage is written to stderr, everything allocated here is
// released, *opts stays NULL and false is returned.
//
// -r may be given several times; the pieces are joined with tabs into one
// @RG line, and "@RG\t" is prepended unless the first piece already starts
// with "@RG".
static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
{
    *opts = NULL;

    if (argc == 1) { usage(stdout); return true; }

    parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
    if (! retval ) {
        fprintf(stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
        return false;
    }
    // Set defaults
    retval->mode = overwrite_all;
    sam_global_args_init(&retval->ga);

    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
        { NULL, 0, NULL, 0 }
    };
    kstring_t rg_line = {0,0,NULL};
    int n;

    while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
        switch (n) {
            case 'r':
                // Are we adding to existing rg line?
                if (ks_len(&rg_line) == 0) {
                    // strncmp stops at the terminator, so short arguments are safe
                    if (strncmp(optarg, "@RG", 3) != 0) {
                        kputs("@RG\t", &rg_line);
                    }
                } else {
                    kputs("\t", &rg_line);
                }
                kputs(optarg, &rg_line);
                break;
            case 'R':
                free(retval->rg_id);   // a repeated -R replaces the earlier value
                retval->rg_id = strdup(optarg);
                if (retval->rg_id == NULL) goto out_of_memory;
                break;
            case 'm':
                if (strcmp(optarg, "overwrite_all") == 0) {
                    retval->mode = overwrite_all;
                } else if (strcmp(optarg, "orphan_only") == 0) {
                    retval->mode = orphan_only;
                } else {
                    goto bad_usage;
                }
                break;
            case 'o':
                free(retval->output_name);   // a repeated -o replaces the earlier value
                retval->output_name = strdup(optarg);
                if (retval->output_name == NULL) goto out_of_memory;
                break;
            case 'h':
                usage(stdout);
                free(rg_line.s);
                cleanup_opts(retval);
                return true;
            case '?':
                goto bad_usage;
            case 'O':
            default:
                if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
                goto bad_usage;
        }
    }
    // NULL when no -r option was given
    retval->rg_line = ks_release(&rg_line);

    if (argc-optind < 1) {
        fprintf(stderr, "You must specify an input file.\n");
        usage(stderr);
        cleanup_opts(retval);
        return false;
    }
    if (retval->rg_id && retval->rg_line) {
        fprintf(stderr, "The options -r and -R are mutually exclusive.\n");
        cleanup_opts(retval);
        return false;
    }

    if (retval->rg_line) {
        char* tmp = basic_unescape(retval->rg_line);
        if (tmp == NULL) {
            fprintf(stderr, "[%s] Out of memory unescaping the RG line\n", __func__);
            cleanup_opts(retval);
            return false;
        }
        if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
            fprintf(stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
            free(tmp);
            cleanup_opts(retval);
            return false;
        }
        // The unescaped copy replaces the raw line; release the raw one
        // (guarded in case basic_unescape ever returns its argument).
        if (tmp != retval->rg_line) free(retval->rg_line);
        retval->rg_line = tmp;
    }

    retval->input_name = strdup(argv[optind+0]);
    if (retval->input_name == NULL) {
        fprintf(stderr, "[%s] Out of memory\n", __func__);
        cleanup_opts(retval);
        return false;
    }

    *opts = retval;
    return true;

out_of_memory:
    fprintf(stderr, "[%s] Out of memory\n", __func__);
    free(rg_line.s);
    cleanup_opts(retval);
    return false;

bad_usage:
    usage(stderr);
    free(rg_line.s);
    cleanup_opts(retval);
    return false;
}
int do_grep() { #ifdef DEBUGa printf("[!]do_grep\n"); #endif BamInfo_t *pbam; kh_cstr_t BamID; khiter_t ki, bami; kstring_t ks1 = { 0, 0, NULL }; kstring_t ks2 = { 0, 0, NULL }; kstring_t ks3 = { 0, 0, NULL }; samFile *in; bam_hdr_t *h; hts_idx_t *idx; bam1_t *b, *d, *d2, *bR1, *bR2, *bR3; bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1(); //htsFile *out; //hts_opt *in_opts = NULL, *out_opts = NULL; int r = 0, exit_code = 0; kvec_t(bam1_t) R1, R2, RV; pierCluster_t *pierCluster; //samdat_t tmp_samdat; FILE *fs = fopen("./test.txt","w"); for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) { //printf(">[%d]:\n",bami); if (kh_exist(bamNFOp, bami)) { kv_init(R1); kv_init(R2); kv_init(RV); //tmp_samdat = (const samdat_t){ 0 }; //memset(&tmp_samdat,0,sizeof(samdat_t)); //printf("-[%d]:\n",bami); BamID = kh_key(bamNFOp, bami); pbam = &kh_value(bamNFOp, bami); fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD); in = sam_open(pbam->fileName, "r"); if (in == NULL) { fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } h = sam_hdr_read(in); /* out = hts_open("-", "w"); if (out == NULL) { fprintf(stderr, "[x]Error opening standard output\n"); return EXIT_FAILURE; } if (sam_hdr_write(out, h) < 0) { fprintf(stderr, "[!]Error writing output header.\n"); exit_code = 1; } */ int8_t *ChrIsHum; if (h == NULL) { fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } else { ChrIsHum = malloc(h->n_targets * sizeof(int8_t)); for (int32_t i=0; i < h->n_targets; ++i) { //ChrIsHum[i] = -1; ki = kh_get(chrNFO, chrNFOp, h->target_name[i]); if (ki == kh_end(chrNFOp)) { errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]); } else { ChrInfo_t * tmp = &kh_value(chrNFOp, ki); ChrIsHum[i] = tmp->isHum; //printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]); } } } h->ignore_sam_err = 0; b = bam_init1(); d = bam_init1(); d2 = bam_init1(); if ((idx = 
sam_index_load(in, pbam->fileName)) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } pierCluster = sam_plp_init(); while ((r = sam_read1(in, h, b)) >= 0) { int8_t flag = false; const bam1_core_t *c = &b->core; if (c->flag & BAM_FSECONDARY) continue; if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(b); for (int i = 0; i < c->n_cigar; ++i) { if (bam_cigar_opchr(cigar[i])=='S') { // soft clipping if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) { flag = true; } } } } if (flag && ChrIsHum[c->tid]) { // Now, skip Virus items. //bam_copy1(bR1, b); flag = 0; // recycle //int enoughMapQ = 0; //kstring_t ks = { 0, 0, NULL }; /*if (sam_format1(h, b, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) { // Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况. //printf(">[%s]\n",ks_str(&ks1)); flag |= 1; //tmp_samdat.b = bam_dup1(b); //kv_push(samdat_t,R1,tmp_samdat); /*if (checkMapQ(ChrIsHum, b, true)) { ++enoughMapQ; }*/ } if (getPairedSam(in, idx, b, d) != 0) { flag &= ~1; continue; } else { flag |= 2; /*if (checkMapQ(ChrIsHum, d, false)) { ++enoughMapQ; }*/ /*if (c->flag & BAM_FSECONDARY) { if (getPairedSam(in, idx, d, d2) == 0) { //sam_format1(h, d2, &ks3); flag |= 4; if (checkMapQ(ChrIsHum, d2, false)) { ++enoughMapQ; } } }*/ } /* 对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。 >[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 353 chr2 13996555 0 50S40M chr18 48245109 0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:40 AS:i:40 XS:i:40 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0; YC:Z:CT YD:Z:f] -[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 177 chr18 48245109 9 40S50M gi|59585|emb|X04615.1|2000 
0 GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:50 AS:i:50 XS:i:46 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0; YC:Z:GA YD:Z:f] +[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 113 gi|59585|emb|X04615.1| 2000 60 40S46M4S chr18 48245109 0 TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:46 AS:i:46 XS:i:27 RG:Z:Fsimout_mB SA:Z:fchr2,13996555,+,50S40M,0,0; YC:Z:CT YD:Z:r] */ /*if (sam_format1(h, d, &ks2) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; }*/ if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) { /*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1)); printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2)); if (flag & 4) { printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3)); } printf("<--%d\n",enoughMapQ);*/ if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) { //printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d); //if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2); } else { //print fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); for (size_t i=0; i<kv_size(pierCluster->Reads);++i) { bam1_t *bi = kv_A(pierCluster->Reads, i); if (sam_format1(h, bi, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code 
= 1; break; } else { fprintf(fs,"%s\n",ks1.s); } } fprintf(fs,"\n"); //printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); //fflush(fs); sam_plp_dectroy(pierCluster); pierCluster = sam_plp_init(); } } } /*char *qname = bam_get_qname(b); if (sam_write1(out, h, b) < 0) { fprintf(stderr, "[x]Error writing output.\n"); exit_code = 1; break; }*/ } /* r = sam_close(out); // stdout can only be closed once if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } */ hts_idx_destroy(idx); bam_destroy1(b); bam_destroy1(d); bam_destroy1(d2); bam_hdr_destroy(h); r = sam_close(in); free(ChrIsHum); #ifdef DEBUGa fflush(NULL); //pressAnyKey(); #endif sam_plp_dectroy(pierCluster); //printf("<[%d]:\n",bami); } } fclose(fs); getPairedSam(NULL, NULL, NULL, NULL); // sam_close(fp2); //printf("---[%d]---\n",exit_code); bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3); ks_release(&ks1); ks_release(&ks2); ks_release(&ks3); return exit_code; }
// Parse the header, count the number of RG tags and return a list of their names static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) { if (hdr->l_text < 3 ) { *count = 0; *output_name = NULL; return true; } kstring_t input = { 0, 0, NULL }; kputsn(hdr->text, hdr->l_text, &input); ////////////////////////////////////////// // First stage count number of @RG tags // ////////////////////////////////////////// char* pointer = ks_str(&input); size_t n_rg = 0; // Guard against rare case where @RG is first header line // This shouldn't happen but could where @HD is omitted if (pointer[0] == '@' && pointer[1] == 'R' && pointer[2] == 'G' ) { ++n_rg; pointer += 3; } char* line; while ((line = strstr(pointer, "\n@RG")) != NULL) { ++n_rg; pointer = line + 1; } ////////////////////////////////// // Second stage locate @RG ID's // ////////////////////////////////// char** names = (char**)calloc(sizeof(char*), n_rg); size_t next = 0; regex_t rg_finder; if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { free(input.s); free(names); return false; } regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); int error; char* begin = ks_str(&input); while ((error = regexec(&rg_finder, begin, 2, matches, 0)) == 0) { kstring_t str = { 0, 0, NULL }; kputsn(begin+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &str); names[next++] = ks_release(&str); begin += matches[0].rm_eo; } if (error != REG_NOMATCH) { // cleanup regfree(&rg_finder); free(matches); free(names); free(input.s); return false; } free(matches); // return results *count = n_rg; *output_name = names; regfree(&rg_finder); free(input.s); return true; }
static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg) { tbl->n_targets = translate->n_targets; tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); tbl->rg_trans = kh_init(c2c); tbl->pg_trans = kh_init(c2c); if (!tbl->tid_trans || !tbl->rg_trans || !tbl->pg_trans) { perror("out of memory"); exit(-1); } int32_t out_len = out->l_text; while (out_len > 0 && out->text[out_len-1] == '\n') {--out_len; } // strip trailing \n's kstring_t out_text = { 0, 0, NULL }; kputsn(out->text, out_len, &out_text); int i, min_tid = -1; tbl->lost_coord_sort = false; khash_t(c2i) *out_tid = kh_init(c2i); for (i = 0; i < out->n_targets; ++i) { int ret; khiter_t iter = kh_put(c2i, out_tid, out->target_name[i], &ret); if (ret <= 0) abort(); kh_value(out_tid, iter) = i; } for (i = 0; i < translate->n_targets; ++i) { khiter_t iter = kh_get(c2i, out_tid, translate->target_name[i]); if (iter == kh_end(out_tid)) { // Append missing entries to out tbl->tid_trans[i] = out->n_targets++; out->target_name = (char**)realloc(out->target_name, sizeof(char*)*out->n_targets); out->target_name[out->n_targets-1] = strdup(translate->target_name[i]); out->target_len = (uint32_t*)realloc(out->target_len, sizeof(uint32_t)*out->n_targets); out->target_len[out->n_targets-1] = translate->target_len[i]; // grep line with regex '^@SQ.*\tSN:%s(\t.*$|$)', translate->target_name[i] // from translate->text regex_t sq_id; regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t)); if (matches == NULL) { perror("out of memory"); exit(-1); } kstring_t seq_regex = { 0, 0, NULL }; ksprintf(&seq_regex, "^@SQ.*\tSN:%s(\t.*$|$)", translate->target_name[i]); regcomp(&sq_id, seq_regex.s, REG_EXTENDED|REG_NEWLINE); free(seq_regex.s); if (regexec(&sq_id, translate->text, 1, matches, 0) != 0) { fprintf(pysamerr, "[trans_tbl_init] @SQ SN (%s) found in binary header but not text header.\n",translate->target_name[i]); exit(1); } regfree(&sq_id); // Produce 
our output line and append it to out_text kputc('\n', &out_text); kputsn(translate->text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &out_text); free(matches); } else { tbl->tid_trans[i] = kh_value(out_tid, iter); } if (tbl->tid_trans[i] > min_tid) { min_tid = tbl->tid_trans[i]; } else { tbl->lost_coord_sort = true; } } kh_destroy(c2i, out_tid); // grep @RG id's regex_t rg_id; regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t)); if (matches == NULL) { perror("out of memory"); exit(-1); } regcomp(&rg_id, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE); char* text = translate->text; klist_t(hdrln) *rg_list = kl_init(hdrln); while(1) { // foreach rg id in translate's header if (regexec(&rg_id, text, 2, matches, 0) != 0) break; // matches[0] is the whole @RG line; matches[1] is the ID field value kstring_t match_id = { 0, 0, NULL }; kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id); // is our matched ID in our output list already regex_t rg_id_search; kstring_t rg_regex = { 0, 0, NULL }; ksprintf(&rg_regex, "^@RG.*\tID:%s(\t.*$|$)", match_id.s); regcomp(&rg_id_search, rg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB); free(rg_regex.s); kstring_t transformed_id = { 0, 0, NULL }; bool transformed_equals_match; if (regexec(&rg_id_search, out->text, 0, NULL, 0) != 0 || merge_rg) { // Not in there so can add it as 1-1 mapping kputs(match_id.s, &transformed_id); transformed_equals_match = true; } else { // It's in there so we need to transform it by appending random number to id ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48()); transformed_equals_match = false; } regfree(&rg_id_search); // Insert it into our translation map int in_there = 0; khiter_t iter = kh_put(c2c, tbl->rg_trans, ks_release(&match_id), &in_there); char *transformed_id_s = ks_release(&transformed_id); kh_value(tbl->rg_trans,iter) = transformed_id_s; // take matched line and replace ID with transformed_id kstring_t 
transformed_line = { 0, 0, NULL }; if (transformed_equals_match) { kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line); } else { kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line); kputs(transformed_id_s, &transformed_line); kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line); } if (!(transformed_equals_match && merge_rg)) { // append line to linked list for PG processing char** ln = kl_pushp(hdrln, rg_list); *ln = ks_release(&transformed_line); // Give away to linked list } else free(transformed_line.s); text += matches[0].rm_eo; // next! } regfree(&rg_id); // Do same for PG id's regex_t pg_id; regcomp(&pg_id, "^@PG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE); text = translate->text; klist_t(hdrln) *pg_list = kl_init(hdrln); while(1) { // foreach pg id in translate's header if (regexec(&pg_id, text, 2, matches, 0) != 0) break; kstring_t match_id = { 0, 0, NULL }; kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id); // is our matched ID in our output list already regex_t pg_id_search; kstring_t pg_regex = { 0, 0, NULL }; ksprintf(&pg_regex, "^@PG.*\tID:%s(\t.*$|$)", match_id.s); regcomp(&pg_id_search, pg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB); free(pg_regex.s); kstring_t transformed_id = { 0, 0, NULL }; bool transformed_equals_match; if (regexec(&pg_id_search, out->text, 0, NULL, 0) != 0 || merge_pg) { // Not in there so can add it as 1-1 mapping kputs(match_id.s, &transformed_id); transformed_equals_match = true; } else { // It's in there so we need to transform it by appending random number to id ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48()); transformed_equals_match = false; } regfree(&pg_id_search); // Insert it into our translation map int in_there = 0; khiter_t iter = kh_put(c2c, tbl->pg_trans, ks_release(&match_id), &in_there); char *transformed_id_s = ks_release(&transformed_id); 
kh_value(tbl->pg_trans,iter) = transformed_id_s; // take matched line and replace ID with transformed_id kstring_t transformed_line = { 0, 0, NULL }; if (transformed_equals_match) { kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line); } else { kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line); kputs(transformed_id_s, &transformed_line); kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line); } if (!(transformed_equals_match && merge_pg)) { // append line to linked list for PP processing char** ln = kl_pushp(hdrln, pg_list); *ln = ks_release(&transformed_line); // Give away to linked list } else free(transformed_line.s); text += matches[0].rm_eo; // next! } regfree(&pg_id); // need to translate PP's on the fly in second pass because they may not be in correct order and need complete tbl->pg_trans to do this // for each line { // with ID replaced with tranformed_id and PP's transformed using the translation table // } regex_t pg_pp; regcomp(&pg_pp, "^@PG.*\tPP:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE); kliter_t(hdrln) *iter = kl_begin(pg_list); while (iter != kl_end(pg_list)) { char* data = kl_val(iter); kstring_t transformed_line = { 0, 0, NULL }; // Find PP tag if (regexec(&pg_pp, data, 2, matches, 0) == 0) { // Lookup in hash table kstring_t pp_id = { 0, 0, NULL }; kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pp_id); khiter_t k = kh_get(c2c, tbl->pg_trans, pp_id.s); free(pp_id.s); char* transformed_id = kh_value(tbl->pg_trans,k); // Replace kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line); kputs(transformed_id, &transformed_line); kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line); } else { kputs(data, &transformed_line); } // Produce our output line and append it to out_text kputc('\n', &out_text); kputsn(transformed_line.s, transformed_line.l, &out_text); free(transformed_line.s); 
free(data); iter = kl_next(iter); } regfree(&pg_pp); // Need to also translate @RG PG's on the fly too regex_t rg_pg; regcomp(&rg_pg, "^@RG.*\tPG:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE); kliter_t(hdrln) *rg_iter = kl_begin(rg_list); while (rg_iter != kl_end(rg_list)) { char* data = kl_val(rg_iter); kstring_t transformed_line = { 0, 0, NULL }; // Find PG tag if (regexec(&rg_pg, data, 2, matches, 0) == 0) { // Lookup in hash table kstring_t pg_id = { 0, 0, NULL }; kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pg_id); khiter_t k = kh_get(c2c, tbl->pg_trans, pg_id.s); free(pg_id.s); char* transformed_id = kh_value(tbl->pg_trans,k); // Replace kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line); kputs(transformed_id, &transformed_line); kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line); } else { kputs(data, &transformed_line); } // Produce our output line and append it to out_text kputc('\n', &out_text); kputsn(transformed_line.s, transformed_line.l, &out_text); free(transformed_line.s); free(data); rg_iter = kl_next(rg_iter); } regfree(&rg_pg); kl_destroy(hdrln,pg_list); kl_destroy(hdrln,rg_list); free(matches); // Add trailing \n and write back to header free(out->text); kputc('\n', &out_text); out->l_text = out_text.l; out->text = ks_release(&out_text); }