Exemple #1
0
// Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing),
// or upon encountering an unexpected token, that token's type.
// Explicit `return '?'` means a JSON parsing error, typically a member key
// that is not a string.  An unexpected token may be a valid token that was
// not the type expected for a particular GA4GH field, or it may be '?' or
// '\0' which should be propagated.
static char
parse_ga4gh_redirect_json(hFILE_multipart *fp, hFILE *json,
                          kstring_t *b, kstring_t *header)
{
    hts_json_token t;

    if (hts_json_fnext(json, &t, b) != '{') return t.type;
    while (hts_json_fnext(json, &t, b) != '}') {
        if (t.type != 's') return '?';

        if (strcmp(t.str, "urls") == 0) {
            if (hts_json_fnext(json, &t, b) != '[') return t.type;

            while (hts_json_fnext(json, &t, b) != ']') {
                hfile_part *part;
                size_t n = 0, max = 0;

                hts_expand(hfile_part, fp->nparts+1, fp->maxparts, fp->parts);
                part = &fp->parts[fp->nparts++];
                part->url = NULL;
                part->headers = NULL;

                if (t.type != '{') return t.type;
                while (hts_json_fnext(json, &t, b) != '}') {
                    if (t.type != 's') return '?';

                    if (strcmp(t.str, "url") == 0) {
                        if (hts_json_fnext(json, &t, b) != 's') return t.type;
                        part->url = ks_release(b);
                    }
                    else if (strcmp(t.str, "headers") == 0) {
                        if (hts_json_fnext(json, &t, b) != '{') return t.type;

                        while (hts_json_fnext(json, &t, header) != '}') {
                            if (t.type != 's') return '?';

                            if (hts_json_fnext(json, &t, b) != 's')
                                return t.type;

                            kputs(": ", header);
                            kputs(t.str, header);
                            n++;
                            hts_expand(char *, n+1, max, part->headers);
                            part->headers[n-1] = ks_release(header);
                            part->headers[n] = NULL;
                        }
                    }
                    else if (hts_json_fskip_value(json, '\0') != 'v')
                        return '?';
                }

                if (! part->url) return 'i';
            }
        }
        else if (strcmp(t.str, "format") == 0) {
Exemple #2
0
// Expands a output filename format string
static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx)
{
    kstring_t str = { 0, 0, NULL };
    const char* pointer = format_string;
    const char* next;
    while ((next = strchr(pointer, '%')) != NULL) {
        kputsn(pointer, next-pointer, &str);
        ++next;
        switch (*next) {
        case '%':
            kputc('%', &str);
            break;
        case '*':
            kputs(basename, &str);
            break;
        case '#':
            kputl(rg_idx, &str);
            break;
        case '!':
            kputs(rg_id, &str);
            break;
        case '\0':
            // Error is: fprintf(pysamerr, "bad format string, trailing %%\n");
            free(str.s);
            return NULL;
        default:
            // Error is: fprintf(pysamerr, "bad format string, unknown format specifier\n");
            free(str.s);
            return NULL;
        }
        pointer = next + 1;
    }
    kputs(pointer, &str);
    return ks_release(&str);
}
Exemple #3
0
// Filters a header of @RG lines where ID != id_keep
// TODO: strip @PG's descended from other RGs and their descendants
static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
{
    kstring_t str = {0, 0, NULL};

    regex_t rg_finder;

    if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) {
        return false;
    }

    // regex vars
    char* header = hdr->text;
    regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2);
    kstring_t found_id = { 0, 0, NULL };
    int error;

    while ((error = regexec(&rg_finder, header, 2, matches, 0)) == 0) {
        kputsn(header, matches[0].rm_so, &str); // copy header up until the found RG line

        found_id.l = 0;
        kputsn(header+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &found_id); // extract ID
        // if it matches keep keep it, else we can just ignore it
        if (strcmp(ks_str(&found_id), id_keep) == 0) {
            kputsn(header+matches[0].rm_so, (matches[0].rm_eo+1)-matches[0].rm_so, &str);
        }
        // move pointer forward
        header += matches[0].rm_eo+1;
    }
    // cleanup
    free(found_id.s);
    free(matches);
    regfree(&rg_finder);
    // Did we leave loop because of an error?
    if (error != REG_NOMATCH) {
        return false;
    }

    // Write remainder of string
    kputs(header, &str);

    // Modify header
    hdr->l_text = ks_len(&str);
    free(hdr->text);
    hdr->text = ks_release(&str);

    return true;
}
Exemple #4
0
// Expands a output filename format string
static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
{
    kstring_t str = { 0, 0, NULL };
    const char* pointer = format_string;
    const char* next;
    while ((next = strchr(pointer, '%')) != NULL) {
        kputsn(pointer, next-pointer, &str);
        ++next;
        switch (*next) {
            case '%':
                kputc('%', &str);
                break;
            case '*':
                kputs(basename, &str);
                break;
            case '#':
                kputl(rg_idx, &str);
                break;
            case '!':
                kputs(rg_id, &str);
                break;
            case '.':
                // Only really need to cope with sam, bam, cram
                if (format->format != unknown_format)
                    kputs(hts_format_file_extension(format), &str);
                else
                    kputs("bam", &str);
                break;
            case '\0':
                // Error is: fprintf(pysam_stderr, "bad format string, trailing %%\n");
                free(str.s);
                return NULL;
            default:
                // Error is: fprintf(pysam_stderr, "bad format string, unknown format specifier\n");
                free(str.s);
                return NULL;
        }
        pointer = next + 1;
    }
    kputs(pointer, &str);
    return ks_release(&str);
}
Exemple #5
0
static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
{
    *opts = NULL;
    int n;

    if (argc == 1) { usage(stdout); return true; }

    parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
    if (! retval ) {
        fprintf(stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
        return false;
    }
    // Set defaults
    retval->mode = overwrite_all;
    sam_global_args_init(&retval->ga);
    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
        { NULL, 0, NULL, 0 }
    };
    kstring_t rg_line = {0,0,NULL};

    while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
        switch (n) {
            case 'r':
                // Are we adding to existing rg line?
                if (ks_len(&rg_line) == 0) {
                    if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
                        kputs("@RG\t", &rg_line);
                    }
                } else {
                    kputs("\t", &rg_line);
                }
                kputs(optarg, &rg_line);
                break;
            case 'R':
                retval->rg_id = strdup(optarg);
                break;
            case 'm': {
                if (strcmp(optarg, "overwrite_all") == 0) {
                    retval->mode = overwrite_all;
                } else if (strcmp(optarg, "orphan_only") == 0) {
                    retval->mode = orphan_only;
                } else {
                    usage(stderr);
                    return false;
                }
                break;
            }
            case 'o':
                retval->output_name = strdup(optarg);
                break;
            case 'h':
                usage(stdout);
                free(retval);
                return true;
            case '?':
                usage(stderr);
                free(retval);
                return false;
            case 'O':
            default:
                if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
                usage(stderr);
                free(retval);
                return false;
        }
    }
    retval->rg_line = ks_release(&rg_line);

    if (argc-optind < 1) {
        fprintf(stderr, "You must specify an input file.\n");
        usage(stderr);
        cleanup_opts(retval);
        return false;
    }
    if (retval->rg_id && retval->rg_line) {
        fprintf(stderr, "The options -r and -R are mutually exclusive.\n");
        cleanup_opts(retval);
        return false;
    }

    if (retval->rg_line)
    {
        char* tmp = basic_unescape(retval->rg_line);

        if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
            fprintf(stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
            free(tmp);
            cleanup_opts(retval);
            return false;
        }
        retval->rg_line = tmp;
    }
    retval->input_name = strdup(argv[optind+0]);

    *opts = retval;
    return true;
}
int do_grep() {
#ifdef DEBUGa
	printf("[!]do_grep\n");
#endif
	BamInfo_t *pbam;
	kh_cstr_t BamID;
	khiter_t ki, bami;
	kstring_t ks1 = { 0, 0, NULL };
	kstring_t ks2 = { 0, 0, NULL };
	kstring_t ks3 = { 0, 0, NULL };

	samFile *in;
	bam_hdr_t *h;
	hts_idx_t *idx;
	bam1_t *b, *d, *d2, *bR1, *bR2, *bR3;
	bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1();
	//htsFile *out;
	//hts_opt *in_opts = NULL, *out_opts = NULL;
	int r = 0, exit_code = 0;

	kvec_t(bam1_t) R1, R2, RV;
	pierCluster_t *pierCluster;
	//samdat_t tmp_samdat;
	FILE *fs = fopen("./test.txt","w");

	for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) {
		//printf(">[%d]:\n",bami);
		if (kh_exist(bamNFOp, bami)) {
			kv_init(R1); kv_init(R2); kv_init(RV);
			//tmp_samdat = (const samdat_t){ 0 };
			//memset(&tmp_samdat,0,sizeof(samdat_t));
			//printf("-[%d]:\n",bami);
			BamID = kh_key(bamNFOp, bami);
			pbam = &kh_value(bamNFOp, bami);
			fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD);

			in = sam_open(pbam->fileName, "r");
			if (in == NULL) {
				fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			}
			h = sam_hdr_read(in);
/*			out = hts_open("-", "w");
			if (out == NULL) {
				fprintf(stderr, "[x]Error opening standard output\n");
				return EXIT_FAILURE;
			}
			if (sam_hdr_write(out, h) < 0) {
				fprintf(stderr, "[!]Error writing output header.\n");
				exit_code = 1;
			}
*/
			int8_t *ChrIsHum;
			if (h == NULL) {
				fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			} else {
				ChrIsHum = malloc(h->n_targets * sizeof(int8_t));
				for (int32_t i=0; i < h->n_targets; ++i) {
					//ChrIsHum[i] = -1;
					ki = kh_get(chrNFO, chrNFOp, h->target_name[i]);
					if (ki == kh_end(chrNFOp)) {
						errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]);
					} else {
						ChrInfo_t * tmp = &kh_value(chrNFOp, ki);
						ChrIsHum[i] = tmp->isHum;
						//printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]);
					}
				}
			}
			h->ignore_sam_err = 0;
			b = bam_init1();
			d = bam_init1();
			d2 = bam_init1();
			if ((idx = sam_index_load(in, pbam->fileName)) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
				return 1;
			}
			pierCluster = sam_plp_init();
			while ((r = sam_read1(in, h, b)) >= 0) {
				int8_t flag = false;
				const bam1_core_t *c = &b->core;
				if (c->flag & BAM_FSECONDARY) continue;
				if (c->n_cigar) {
					uint32_t *cigar = bam_get_cigar(b);
					for (int i = 0; i < c->n_cigar; ++i) {
						if (bam_cigar_opchr(cigar[i])=='S') {	// soft clipping
							if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) {
								flag = true;
							}
						}
					}
				}
				if (flag && ChrIsHum[c->tid]) {	// Now, skip Virus items.
					//bam_copy1(bR1, b);
					flag = 0;	// recycle
					//int enoughMapQ = 0;
					//kstring_t ks = { 0, 0, NULL };
					/*if (sam_format1(h, b, &ks1) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					} else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) {	// Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况.
						//printf(">[%s]\n",ks_str(&ks1));
						flag |= 1;
						//tmp_samdat.b = bam_dup1(b);
						//kv_push(samdat_t,R1,tmp_samdat);
						/*if (checkMapQ(ChrIsHum, b, true)) {
							++enoughMapQ;
						}*/
					}
					if (getPairedSam(in, idx, b, d) != 0) {
						flag &= ~1;
						continue;
					} else {
						flag |= 2;
						/*if (checkMapQ(ChrIsHum, d, false)) {
							++enoughMapQ;
						}*/
						/*if (c->flag & BAM_FSECONDARY) {
							if (getPairedSam(in, idx, d, d2) == 0) {
								//sam_format1(h, d2, &ks3);
								flag |= 4;
								if (checkMapQ(ChrIsHum, d2, false)) {
									++enoughMapQ;
								}
							}
						}*/
					}
/*
对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。
>[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	353	chr2	13996555	0	50S40M	chr18	48245109	0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:40	AS:i:40	XS:i:40	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0;	YC:Z:CT	YD:Z:f]
-[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	177	chr18	48245109	9	40S50M	gi|59585|emb|X04615.1|2000	0	GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:50	AS:i:50	XS:i:46	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0;	YC:Z:GA	YD:Z:f]
+[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	113	gi|59585|emb|X04615.1|	2000	60	40S46M4S	chr18	48245109	0	TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:46	AS:i:46	XS:i:27	RG:Z:Fsimout_mB	SA:Z:fchr2,13996555,+,50S40M,0,0;	YC:Z:CT	YD:Z:r]
*/
					/*if (sam_format1(h, d, &ks2) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					}*/
					if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) {
						/*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1));
						printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2));
						if (flag & 4) {
							printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3));
						}
						printf("<--%d\n",enoughMapQ);*/
						if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) {
							//printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d);
							//if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2);
						} else {
							//print
							fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos);
							for (size_t i=0; i<kv_size(pierCluster->Reads);++i) {
								bam1_t *bi = kv_A(pierCluster->Reads, i);
								if (sam_format1(h, bi, &ks1) < 0) {
									fprintf(stderr, "Error writing output.\n");
									exit_code = 1;
									break;
								} else {
									fprintf(fs,"%s\n",ks1.s);
								}
							}
							fprintf(fs,"\n");
							//printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							//fflush(fs);
							sam_plp_dectroy(pierCluster);
							pierCluster = sam_plp_init();
						}
					}
				}
				/*char *qname = bam_get_qname(b);
				if (sam_write1(out, h, b) < 0) {
					fprintf(stderr, "[x]Error writing output.\n");
					exit_code = 1;
					break;
				}*/
			}
/*			r = sam_close(out);   // stdout can only be closed once
			if (r < 0) {
				fprintf(stderr, "Error closing output.\n");
				exit_code = 1;
			}
*/
			hts_idx_destroy(idx);
			bam_destroy1(b);
			bam_destroy1(d);
			bam_destroy1(d2);
			bam_hdr_destroy(h);
			r = sam_close(in);
			free(ChrIsHum);
#ifdef DEBUGa
			fflush(NULL);
			//pressAnyKey();
#endif
			sam_plp_dectroy(pierCluster);
			//printf("<[%d]:\n",bami);
		}
	}
	fclose(fs);
	getPairedSam(NULL, NULL, NULL, NULL);	// sam_close(fp2);
	//printf("---[%d]---\n",exit_code);
	bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3);
	ks_release(&ks1);
	ks_release(&ks2);
	ks_release(&ks3);
	return exit_code;
}
Exemple #7
0
// Parse the header, count the number of RG tags and return a list of their names
static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name)
{
    if (hdr->l_text < 3 ) {
        *count = 0;
        *output_name = NULL;
        return true;
    }
    kstring_t input = { 0, 0, NULL };
    kputsn(hdr->text, hdr->l_text, &input);

    //////////////////////////////////////////
    // First stage count number of @RG tags //
    //////////////////////////////////////////
    char* pointer = ks_str(&input);
    size_t n_rg = 0;
    // Guard against rare case where @RG is first header line
    // This shouldn't happen but could where @HD is omitted
    if (pointer[0] == '@' && pointer[1] == 'R' && pointer[2] == 'G' ) {
        ++n_rg;
        pointer += 3;
    }
    char* line;
    while ((line = strstr(pointer, "\n@RG")) != NULL) {
        ++n_rg;
        pointer = line + 1;
    }

    //////////////////////////////////
    // Second stage locate @RG ID's //
    //////////////////////////////////
    char** names = (char**)calloc(sizeof(char*), n_rg);
    size_t next = 0;

    regex_t rg_finder;
    if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) {
        free(input.s);
        free(names);
        return false;
    }
    regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2);
    int error;
    char* begin = ks_str(&input);

    while ((error = regexec(&rg_finder, begin, 2, matches, 0)) == 0) {
        kstring_t str = { 0, 0, NULL };
        kputsn(begin+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &str);
        names[next++] = ks_release(&str);
        begin += matches[0].rm_eo;
    }

    if (error != REG_NOMATCH) {
        // cleanup
        regfree(&rg_finder);
        free(matches);
        free(names);
        free(input.s);
        return false;
    }
    free(matches);

    // return results
    *count = n_rg;
    *output_name = names;
    regfree(&rg_finder);
    free(input.s);
    return true;
}
static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg)
{
    tbl->n_targets = translate->n_targets;
    tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
    tbl->rg_trans = kh_init(c2c);
    tbl->pg_trans = kh_init(c2c);
    if (!tbl->tid_trans || !tbl->rg_trans || !tbl->pg_trans) { perror("out of memory"); exit(-1); }

    int32_t out_len = out->l_text;
    while (out_len > 0 && out->text[out_len-1] == '\n') {--out_len; } // strip trailing \n's
    kstring_t out_text = { 0, 0, NULL };
    kputsn(out->text, out_len, &out_text);

    int i, min_tid = -1;
    tbl->lost_coord_sort = false;

    khash_t(c2i) *out_tid = kh_init(c2i);
    for (i = 0; i < out->n_targets; ++i) {
        int ret;
        khiter_t iter = kh_put(c2i, out_tid, out->target_name[i], &ret);
        if (ret <= 0) abort();
        kh_value(out_tid, iter) = i;
    }

    for (i = 0; i < translate->n_targets; ++i) {
        khiter_t iter = kh_get(c2i, out_tid, translate->target_name[i]);

        if (iter == kh_end(out_tid)) { // Append missing entries to out
            tbl->tid_trans[i] = out->n_targets++;
            out->target_name = (char**)realloc(out->target_name, sizeof(char*)*out->n_targets);
            out->target_name[out->n_targets-1] = strdup(translate->target_name[i]);
            out->target_len = (uint32_t*)realloc(out->target_len, sizeof(uint32_t)*out->n_targets);
            out->target_len[out->n_targets-1] = translate->target_len[i];
            // grep line with regex '^@SQ.*\tSN:%s(\t.*$|$)', translate->target_name[i]
            // from translate->text
            regex_t sq_id;
            regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
            if (matches == NULL) { perror("out of memory"); exit(-1); }
            kstring_t seq_regex = { 0, 0, NULL };
            ksprintf(&seq_regex, "^@SQ.*\tSN:%s(\t.*$|$)", translate->target_name[i]);
            regcomp(&sq_id, seq_regex.s, REG_EXTENDED|REG_NEWLINE);
            free(seq_regex.s);
            if (regexec(&sq_id, translate->text, 1, matches, 0) != 0)
            {
                fprintf(pysamerr, "[trans_tbl_init] @SQ SN (%s) found in binary header but not text header.\n",translate->target_name[i]);
                exit(1);
            }
            regfree(&sq_id);

            // Produce our output line and append it to out_text
            kputc('\n', &out_text);
            kputsn(translate->text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &out_text);

            free(matches);
        } else {
            tbl->tid_trans[i] = kh_value(out_tid, iter);
        }
        if (tbl->tid_trans[i] > min_tid) {
            min_tid = tbl->tid_trans[i];
        } else {
            tbl->lost_coord_sort = true;
        }
    }
    kh_destroy(c2i, out_tid);

    // grep @RG id's
    regex_t rg_id;
    regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
    if (matches == NULL) { perror("out of memory"); exit(-1); }
    regcomp(&rg_id, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    char* text = translate->text;
    klist_t(hdrln) *rg_list = kl_init(hdrln);
    while(1) { //   foreach rg id in translate's header
        if (regexec(&rg_id, text, 2, matches, 0) != 0) break;
        // matches[0] is the whole @RG line; matches[1] is the ID field value
        kstring_t match_id = { 0, 0, NULL };
        kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);

        // is our matched ID in our output list already
        regex_t rg_id_search;
        kstring_t rg_regex = { 0, 0, NULL };
        ksprintf(&rg_regex, "^@RG.*\tID:%s(\t.*$|$)", match_id.s);
        regcomp(&rg_id_search, rg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
        free(rg_regex.s);
        kstring_t transformed_id = { 0, 0, NULL };
        bool transformed_equals_match;
        if (regexec(&rg_id_search, out->text, 0, NULL, 0) != 0  || merge_rg) {
            // Not in there so can add it as 1-1 mapping
            kputs(match_id.s, &transformed_id);
            transformed_equals_match = true;
        } else {
            // It's in there so we need to transform it by appending random number to id
            ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
            transformed_equals_match = false;
        }
        regfree(&rg_id_search);

        // Insert it into our translation map
        int in_there = 0;
        khiter_t iter = kh_put(c2c, tbl->rg_trans, ks_release(&match_id), &in_there);
        char *transformed_id_s = ks_release(&transformed_id);
        kh_value(tbl->rg_trans,iter) = transformed_id_s;
        // take matched line and replace ID with transformed_id
        kstring_t transformed_line = { 0, 0, NULL };
        if (transformed_equals_match) {
            kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
        } else {
            kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
            kputs(transformed_id_s, &transformed_line);
            kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
        }

        if (!(transformed_equals_match && merge_rg)) {
            // append line to linked list for PG processing
            char** ln = kl_pushp(hdrln, rg_list);
            *ln = ks_release(&transformed_line);  // Give away to linked list
        }
        else free(transformed_line.s);

        text += matches[0].rm_eo; // next!
    }
    regfree(&rg_id);

    // Do same for PG id's
    regex_t pg_id;
    regcomp(&pg_id, "^@PG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    text = translate->text;
    klist_t(hdrln) *pg_list = kl_init(hdrln);
    while(1) { //   foreach pg id in translate's header
        if (regexec(&pg_id, text, 2, matches, 0) != 0) break;
        kstring_t match_id = { 0, 0, NULL };
        kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);

        // is our matched ID in our output list already
        regex_t pg_id_search;
        kstring_t pg_regex = { 0, 0, NULL };
        ksprintf(&pg_regex, "^@PG.*\tID:%s(\t.*$|$)", match_id.s);
        regcomp(&pg_id_search, pg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
        free(pg_regex.s);
        kstring_t transformed_id = { 0, 0, NULL };
        bool transformed_equals_match;
        if (regexec(&pg_id_search, out->text, 0, NULL, 0) != 0 || merge_pg) {
            // Not in there so can add it as 1-1 mapping
            kputs(match_id.s, &transformed_id);
            transformed_equals_match = true;
        } else {
            // It's in there so we need to transform it by appending random number to id
            ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
            transformed_equals_match = false;
        }
        regfree(&pg_id_search);

        // Insert it into our translation map
        int in_there = 0;
        khiter_t iter = kh_put(c2c, tbl->pg_trans, ks_release(&match_id), &in_there);
        char *transformed_id_s = ks_release(&transformed_id);
        kh_value(tbl->pg_trans,iter) = transformed_id_s;
        // take matched line and replace ID with transformed_id
        kstring_t transformed_line = { 0, 0, NULL };
        if (transformed_equals_match) {
            kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
        } else {
            kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
            kputs(transformed_id_s, &transformed_line);
            kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
        }

        if (!(transformed_equals_match && merge_pg)) {
            // append line to linked list for PP processing
            char** ln = kl_pushp(hdrln, pg_list);
            *ln = ks_release(&transformed_line);  // Give away to linked list
        }
        else free(transformed_line.s);
        text += matches[0].rm_eo; // next!
    }
    regfree(&pg_id);
    // need to translate PP's on the fly in second pass because they may not be in correct order and need complete tbl->pg_trans to do this
    // for each line {
    // with ID replaced with tranformed_id and PP's transformed using the translation table
    // }
    regex_t pg_pp;
    regcomp(&pg_pp, "^@PG.*\tPP:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    kliter_t(hdrln) *iter = kl_begin(pg_list);
    while (iter != kl_end(pg_list)) {
        char* data = kl_val(iter);

        kstring_t transformed_line = { 0, 0, NULL };
        // Find PP tag
        if (regexec(&pg_pp, data, 2, matches, 0) == 0) {
            // Lookup in hash table
            kstring_t pp_id = { 0, 0, NULL };
            kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pp_id);

            khiter_t k = kh_get(c2c, tbl->pg_trans, pp_id.s);
            free(pp_id.s);
            char* transformed_id = kh_value(tbl->pg_trans,k);
            // Replace
            kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
            kputs(transformed_id, &transformed_line);
            kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
        } else { kputs(data, &transformed_line); }
        // Produce our output line and append it to out_text
        kputc('\n', &out_text);
        kputsn(transformed_line.s, transformed_line.l, &out_text);

        free(transformed_line.s);
        free(data);
        iter = kl_next(iter);
    }
    regfree(&pg_pp);

    // Need to also translate @RG PG's on the fly too
    regex_t rg_pg;
    regcomp(&rg_pg, "^@RG.*\tPG:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    kliter_t(hdrln) *rg_iter = kl_begin(rg_list);
    while (rg_iter != kl_end(rg_list)) {
        char* data = kl_val(rg_iter);

        kstring_t transformed_line = { 0, 0, NULL };
        // Find PG tag
        if (regexec(&rg_pg, data, 2, matches, 0) == 0) {
            // Lookup in hash table
            kstring_t pg_id = { 0, 0, NULL };
            kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pg_id);

            khiter_t k = kh_get(c2c, tbl->pg_trans, pg_id.s);
            free(pg_id.s);
            char* transformed_id = kh_value(tbl->pg_trans,k);
            // Replace
            kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
            kputs(transformed_id, &transformed_line);
            kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
        } else { kputs(data, &transformed_line); }
        // Produce our output line and append it to out_text
        kputc('\n', &out_text);
        kputsn(transformed_line.s, transformed_line.l, &out_text);

        free(transformed_line.s);
        free(data);
        rg_iter = kl_next(rg_iter);
    }

    regfree(&rg_pg);
    kl_destroy(hdrln,pg_list);
    kl_destroy(hdrln,rg_list);
    free(matches);

    // Add trailing \n and write back to header
    free(out->text);
    kputc('\n', &out_text);
    out->l_text = out_text.l;
    out->text = ks_release(&out_text);
}