void add_arc(struct graph_adj *g, int id, int from, int to) { struct arc a; a.id = id; a.to = to; ARRAY_PUSH(g->adj_list[from], a); g->degree[from].dout++; g->degree[to].din++; g->n_arcs++; }
int split_pe(int argc, char *argv[], char *progname) { int c, bc_len = -1, ret, i, j, bc_idx, only_count = 0; unsigned num_mismatches = DEFAULT_NUM_MISMATCHES, num_spacer_bases = DEFAULT_NUM_SPACER_BASES, dna_alpha_len = strlen(DNA_ALPHA), num_undetermined = 0; char *out_prefix = NULL, *fn, **sptr, bc_id[1024], bc_seq[1024], bc_seq_cpy[1024]; // hello, buffer overflow clock_t t = clock(); BcRec bc; ArrayBcRec bcs; FILE *fp; kseq_t *seq1, *seq2; khash_t(str) *h = kh_init(str); khint_t k, k2; gzFile *fp1, *fp2; ARRAY_INIT(&bcs, BcRec, 1000); while ((c = getopt(argc, argv, "m:s:o:c")) >= 0) { switch (c) { case 'm': if (sscanf(optarg, "%u", &num_mismatches) != 1) { fprintf(stderr, "Error: option -m expects unsigned int\n"); return -1; } break; case 's': if (sscanf(optarg, "%u", &num_spacer_bases) != 1) { fprintf(stderr, "Error: option -s expects unsigned int\n"); return -1; } break; case 'o': out_prefix = strdup(optarg); break; case 'c': only_count = 1; break; } } if (optind + 3 != argc) { print_pe_usage(progname); return -1; } if (num_mismatches != 0 && num_mismatches != 1) { fprintf(stderr, "Error: argument -m has to be 0 or 1\n"); return -1; } if (out_prefix == NULL) { out_prefix = strdup(DEFAULT_OUTPUT_PREFIX); } for (sptr = argv+optind; sptr-argv<argc; sptr++) { if (access(*sptr, F_OK) == -1) { fprintf(stderr, "Error: file %s does not exist\n", *sptr); return -1; } } fprintf(stderr, "[barcode file: %s]\n", argv[optind]); fprintf(stderr, "[fastq file1: %s]\n", argv[optind+1]); fprintf(stderr, "[fastq file2: %s]\n", argv[optind+2]); fprintf(stderr, "[number of mismatches allowed: %u]\n", num_mismatches); fprintf(stderr, "[number of spacer bases: %u]\n", num_spacer_bases); fprintf(stderr, "[output prefix: %s]\n", out_prefix); fprintf(stderr, "[only count: %s]\n", only_count ? "true" : "false"); /* read barcode file */ if ((fp = fopen(argv[optind], "r")) == NULL) { fprintf(stderr, "Error: cannot open barcode file %s\n", argv[optind]); return -1; } while (fscanf(fp, "%s %s", bc_id, bc_seq) == 2) { bc_len = strlen(bc_seq); bc.id = strdup(bc_id); bc.seq = strdup(bc_seq); bc.num_found = 0; if (!only_count) { fn = (char*)calloc(strlen(out_prefix) + 3 + strlen(bc_id) + 6 + 1, sizeof(char)); strcpy(fn, out_prefix); strcat(fn, "_1_"); strcat(fn, bc_id); strcat(fn, ".fq.gz"); bc.fp1 = gzopen(fn, "w"); fn[strlen(out_prefix)+1] = '2'; bc.fp2 = gzopen(fn, "w"); free(fn); } else { bc.fp1 = NULL; bc.fp2 = NULL; } ARRAY_PUSH(&bcs, BcRec, bc); k = kh_put(str, h, strdup(bc_seq), &ret); if (num_mismatches == 0) { kh_val(h, k) = bcs.nextfree - 1; //printf("setting %s to %lu (%s %s)\n", bc_seq, bcs.nextfree - 1, bcs.elems[bcs.nextfree - 1].seq, bcs.elems[bcs.nextfree - 1].id); } else { for (i=0; i<strlen(bc_seq); i++) { strcpy(bc_seq_cpy, bc_seq); for (j=0; j<dna_alpha_len; j++) { bc_seq_cpy[i] = DNA_ALPHA[j]; k = kh_put(str, h, strdup(bc_seq_cpy), &ret); kh_val(h, k) = bcs.nextfree - 1; //printf("setting %s to %lu (%s %s)\n", bc_seq_cpy, bcs.nextfree - 1, bcs.elems[bcs.nextfree - 1].seq, bcs.elems[bcs.nextfree - 1].id); } } } } fclose(fp); if (bc_len == -1) { fprintf(stderr, "Error: could not find any barcodes in file %s\n", argv[optind]); return -1; } fp1 = gzopen(argv[optind+1], "r"); seq1 = kseq_init(fp1); fp2 = gzopen(argv[optind+2], "r"); seq2 = kseq_init(fp2); while (kseq_read(seq1) >= 0) { strncpy(bc_seq, seq1->seq.s, bc_len); k = kh_get(str, h, bc_seq); kseq_read(seq2); strncpy(bc_seq, seq2->seq.s, bc_len); k2 = kh_get(str, h, bc_seq); if (k != kh_end(h) || k2 != kh_end(h)) { bc_idx = k2 != kh_end(h) ? kh_val(h, k2) : kh_val(h, k); if (!only_count) { gzprintf(bcs.elems[bc_idx].fp1, "@%s %s\n%s\n+\n%s\n" , seq1->name.s , seq1->comment.s , seq1->seq.s+bc_len+num_spacer_bases , seq1->qual.s+bc_len+num_spacer_bases); gzprintf(bcs.elems[bc_idx].fp2, "@%s %s\n%s\n+\n%s\n" , seq2->name.s , seq2->comment.s , seq2->seq.s+bc_len+num_spacer_bases , seq2->qual.s+bc_len+num_spacer_bases); } bcs.elems[bc_idx].num_found += 2; } else { num_undetermined += 2; } } gzclose(fp1); gzclose(fp2); kseq_destroy(seq1); kseq_destroy(seq2); for (i=0; i<bcs.nextfree; i++) { printf("%s\t%s\t%u\n", bcs.elems[i].id, bcs.elems[i].seq, bcs.elems[i].num_found); if (!only_count) { gzclose(bcs.elems[i].fp1); gzclose(bcs.elems[i].fp2); } } printf("UNDETERMINED\tNONE\t%u\n", num_undetermined); ARRAY_FREE(&bcs); kh_destroy(str, h); fprintf(stderr, "[CPU time: %.2f sec]\n", (float)(clock() - t) / CLOCKS_PER_SEC); return 0; }