예제 #1
0
void add_arc(struct graph_adj *g, int id, int from, int to)
{
     struct arc a;

     a.id = id;
     a.to = to;
     ARRAY_PUSH(g->adj_list[from], a);
     g->degree[from].dout++;
     g->degree[to].din++;
     g->n_arcs++;
}
예제 #2
0
파일: bcsplit.c 프로젝트: jlandry69/gcpub
int split_pe(int argc, char *argv[], char *progname)
{
    int c, bc_len = -1, ret, i, j, bc_idx, only_count = 0;
    unsigned num_mismatches = DEFAULT_NUM_MISMATCHES, 
             num_spacer_bases = DEFAULT_NUM_SPACER_BASES, 
             dna_alpha_len = strlen(DNA_ALPHA),
             num_undetermined = 0;
    char *out_prefix = NULL, *fn, **sptr,
         bc_id[1024], bc_seq[1024], bc_seq_cpy[1024]; // hello, buffer overflow
    clock_t t = clock();
    BcRec bc;
    ArrayBcRec bcs;
    FILE *fp;
    kseq_t *seq1, *seq2;
    khash_t(str) *h = kh_init(str);
    khint_t k, k2;
    gzFile *fp1, *fp2;

    ARRAY_INIT(&bcs, BcRec, 1000);

    while ((c = getopt(argc, argv, "m:s:o:c")) >= 0) {
        switch (c) {
            case 'm': if (sscanf(optarg, "%u", &num_mismatches) != 1) {
                          fprintf(stderr, "Error: option -m expects unsigned int\n");
                          return -1;
                      }
                      break;
            case 's': if (sscanf(optarg, "%u", &num_spacer_bases) != 1) {
                          fprintf(stderr, "Error: option -s expects unsigned int\n");
                          return -1;
                      }
                      break;
            case 'o': out_prefix = strdup(optarg);
                      break;
            case 'c': only_count = 1;
                      break;
        }
    }

    if (optind + 3 != argc) {
        print_pe_usage(progname);
        return -1;
    }

    if (num_mismatches != 0 && num_mismatches != 1) {
        fprintf(stderr, "Error: argument -m has to be 0 or 1\n");
        return -1;
    }

    if (out_prefix == NULL) {
        out_prefix = strdup(DEFAULT_OUTPUT_PREFIX);
    }

    for (sptr = argv+optind; sptr-argv<argc; sptr++) {
        if (access(*sptr, F_OK) == -1) {
            fprintf(stderr, "Error: file %s does not exist\n", *sptr);
            return -1;
        }
    }

    fprintf(stderr, "[barcode file: %s]\n", argv[optind]);
    fprintf(stderr, "[fastq file1: %s]\n", argv[optind+1]);
    fprintf(stderr, "[fastq file2: %s]\n", argv[optind+2]);
    fprintf(stderr, "[number of mismatches allowed: %u]\n", num_mismatches);
    fprintf(stderr, "[number of spacer bases: %u]\n", num_spacer_bases);
    fprintf(stderr, "[output prefix: %s]\n", out_prefix);
    fprintf(stderr, "[only count: %s]\n", only_count ? "true" : "false");

    /* read barcode file */
    if ((fp = fopen(argv[optind], "r")) == NULL) {
        fprintf(stderr, "Error: cannot open barcode file %s\n", argv[optind]);
        return -1;
    }

    while (fscanf(fp, "%s %s", bc_id, bc_seq) == 2) {
        bc_len = strlen(bc_seq);
        bc.id = strdup(bc_id);
        bc.seq = strdup(bc_seq);
        bc.num_found = 0;
        if (!only_count) {
            fn = (char*)calloc(strlen(out_prefix) + 3 + strlen(bc_id) + 6 + 1, sizeof(char));
            strcpy(fn, out_prefix);
            strcat(fn, "_1_");
            strcat(fn, bc_id);
            strcat(fn, ".fq.gz");
            bc.fp1 = gzopen(fn, "w");
            fn[strlen(out_prefix)+1] = '2';
            bc.fp2 = gzopen(fn, "w");
            free(fn);
        } else {
            bc.fp1 = NULL;
            bc.fp2 = NULL;
        }
        ARRAY_PUSH(&bcs, BcRec, bc);
        k = kh_put(str, h, strdup(bc_seq), &ret);
        if (num_mismatches == 0) {
            kh_val(h, k) = bcs.nextfree - 1;
            //printf("setting %s to %lu (%s %s)\n", bc_seq, bcs.nextfree - 1, bcs.elems[bcs.nextfree - 1].seq, bcs.elems[bcs.nextfree - 1].id);
        } else {
            for (i=0; i<strlen(bc_seq); i++) {
                strcpy(bc_seq_cpy, bc_seq);
                for (j=0; j<dna_alpha_len; j++) {
                    bc_seq_cpy[i] = DNA_ALPHA[j];
                    k = kh_put(str, h, strdup(bc_seq_cpy), &ret);
                    kh_val(h, k) = bcs.nextfree - 1;
                    //printf("setting %s to %lu (%s %s)\n", bc_seq_cpy, bcs.nextfree - 1, bcs.elems[bcs.nextfree - 1].seq, bcs.elems[bcs.nextfree - 1].id);
                }
            }
        }
    }

    fclose(fp);

    if (bc_len == -1) {
        fprintf(stderr, "Error: could not find any barcodes in file %s\n", argv[optind]);
        return -1;
    }

    fp1 = gzopen(argv[optind+1], "r");
    seq1 = kseq_init(fp1);
    fp2 = gzopen(argv[optind+2], "r");
    seq2 = kseq_init(fp2);

    while (kseq_read(seq1) >= 0) {
        strncpy(bc_seq, seq1->seq.s, bc_len);
        k = kh_get(str, h, bc_seq);
        kseq_read(seq2);
        strncpy(bc_seq, seq2->seq.s, bc_len);
        k2 = kh_get(str, h, bc_seq);
        if (k != kh_end(h) || k2 != kh_end(h)) {
            bc_idx = k2 != kh_end(h) ? kh_val(h, k2) : kh_val(h, k);
            if (!only_count) {
                gzprintf(bcs.elems[bc_idx].fp1, "@%s %s\n%s\n+\n%s\n"
                         , seq1->name.s
                         , seq1->comment.s
                         , seq1->seq.s+bc_len+num_spacer_bases
                         , seq1->qual.s+bc_len+num_spacer_bases);
                gzprintf(bcs.elems[bc_idx].fp2, "@%s %s\n%s\n+\n%s\n"
                         , seq2->name.s
                         , seq2->comment.s
                         , seq2->seq.s+bc_len+num_spacer_bases
                         , seq2->qual.s+bc_len+num_spacer_bases);
            }
            bcs.elems[bc_idx].num_found += 2;
        } else {    
            num_undetermined += 2;
        }
    }
    
    gzclose(fp1);
    gzclose(fp2);
    kseq_destroy(seq1);
    kseq_destroy(seq2);

    for (i=0; i<bcs.nextfree; i++) {
        printf("%s\t%s\t%u\n", bcs.elems[i].id, bcs.elems[i].seq, bcs.elems[i].num_found); 
        if (!only_count) {
            gzclose(bcs.elems[i].fp1);
            gzclose(bcs.elems[i].fp2);
        }
    }

    printf("UNDETERMINED\tNONE\t%u\n", num_undetermined);

    ARRAY_FREE(&bcs);
    kh_destroy(str, h);

    fprintf(stderr, "[CPU time: %.2f sec]\n", (float)(clock() - t) / CLOCKS_PER_SEC);

    return 0;
}