static int aux_fields1(void) { static const char sam[] = "data:" "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n"; // Canonical form of the alignment record above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); uint8_t *p; uint32_t n; kstring_t ks = { 0, 0, NULL }; if (sam_read1(in, header, aln) >= 0) { if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %d, expected 37", bam_aux2i(p)); if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6) fail("Xf field is %.12f, expected pi", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6) fail("Xf field is %.12f, expected e", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO); if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0) fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF); // TODO Invent and use bam_aux2B() if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! (memcmp(p, "Bc", 2) == 0 && (memcpy(&n, p+2, 4), n) == 3 && memcmp(p+6, "\xfe\x00\x02", 3) == 0)) fail("XB field is %c,..., expected c,-2,0,+2", p[1]); if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %d, expected 1000000", bam_aux2i(p)); if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); free(ks.s); } else fail("can't read record"); bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); return 1; }
int main(int argc, char *argv[]) { bamFile in; sqlite3 * db; sqlite3_stmt * stmt; char * sErrMsg = NULL; char * tail = 0; int nRetCode; char sSQL [BUFFER_SIZE] = "\0"; char database[BUFFER_SIZE]; clock_t startClock,startClock2; if (argc != 2) { fprintf(stderr, "Usage: bamRindex <in.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); //fprintf(stderr,"Options ok\n"); in = bam_open(argv[1], "rb"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } //fprintf(stderr,"BAM opened\n"); assert(strcpy(database,argv[1])!=NULL); assert(strcat(database,".ridx")!=NULL); remove(database); // *********** // Read header bam_header_t *header; header = bam_header_read(in); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); unsigned long num_alns=0; /*********************************************/ /* Open the Database and create the Schema */ // TODO: check the errors sqlite3_open(database, &db); sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg); // create the table SQLITE_CHECK_ERROR(); startClock = clock(); sqlite3_exec(db, "PRAGMA synchronous = 0;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "PRAGMA journal_mode = OFF;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); // Use up to 8GB of memory sqlite3_exec(db, "PRAGMA cache_size = -8000000;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "BEGIN TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); while(bam_read1(in,aln)>=0) { // read alignment //aln->core.tid < 0 ? uint8_t *nh = bam_aux_get(aln, "NH"); uint8_t *nm = bam_aux_get(aln, "NM"); uint8_t *xs = bam_aux_get(aln, "XS"); BOOLEAN isPrimary; BOOLEAN isMapped; BOOLEAN notMapped; BOOLEAN isDuplicate; BOOLEAN isNotPassingQualityControls; BOOLEAN isPaired; BOOLEAN isSecondMateRead,isProperPair; //secondary alignment notMapped=(aln->core.flag & BAM_FUNMAP) ? TRUE: FALSE; //notMapped=((aln->core.flag & BAM_FUNMAP) || (aln->core.mtid ==0)) ? TRUE: FALSE; isMapped=!notMapped; isPrimary= (aln->core.flag & BAM_FSECONDARY) ? FALSE:TRUE; isProperPair=(aln->core.flag & BAM_FPROPER_PAIR) ? TRUE:FALSE; isPaired=(aln->core.flag & BAM_FPAIRED ) ? TRUE:FALSE; isSecondMateRead=(aln->core.flag & BAM_FREAD2 ) ? TRUE: FALSE; isNotPassingQualityControls=(aln->core.flag & BAM_FQCFAIL ) ? TRUE:FALSE; isDuplicate=(aln->core.flag & BAM_FDUP) ? TRUE: FALSE; BOOLEAN isSpliced=FALSE; BOOLEAN hasSimpleCigar=TRUE; int nSpliced=0; int i; if (aln->core.n_cigar != 0) { for (i = 0; i < aln->core.n_cigar; ++i) { char l="MIDNSHP=X"[bam1_cigar(aln)[i]&BAM_CIGAR_MASK]; //fprintf(stderr,"%c",l); if ( l == 'N' ) { isSpliced=TRUE; hasSimpleCigar=FALSE;++nSpliced;} if ( l != 'M' && l!='=' ) { hasSimpleCigar=FALSE;} } } //fprintf(stderr,"read %ld\n",num_alns); // isDuplicate,isNotPassingQualityControls, // isSpliced,isPAired,isPrimary,hasSimpleCigar,isSecondMateRead,isProperPair,nh,nm,qual/mapq,xs sprintf(sSQL,"INSERT into bam_index values (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,'%c')", isDuplicate,isNotPassingQualityControls, nSpliced,isPaired,isPrimary,isMapped,hasSimpleCigar,isSecondMateRead,isProperPair, (nh==0?0:bam_aux2i(nh)),(nm==0?0:bam_aux2i(nm)), aln->core.qual, (xs==0?' ':(bam_aux2A(xs)==0?' ':bam_aux2A(xs)))); sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); ++num_alns; PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); sqlite3_exec(db, "END TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); printf("\nImported %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock))/CLOCKS_PER_SEC); // Create the indexes startClock2 = clock(); // generating the indexes does not pay off //sqlite3_exec(db, INDEXES, NULL, NULL, &sErrMsg); //printf("Indexed %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock2))/CLOCKS_PER_SEC); printf("Total time: %4.2f seconds\n", ((double)(clock() - startClock))/CLOCKS_PER_SEC); sqlite3_close(db); return 0; }
bool BamMerge::next_bam_lines(vector<CBamLine>& bam_lines) { if (_lines.size() <= 0) return false; bam_lines.clear(); vector<CBamLine> temp_bam_lines; while (_lines.size() > 0) { CBamLine brec(_lines.top()); //should have the smallest read_id assert (brec.fileno>=0 && brec.b!=NULL); if ((raw_merge || _last_id != brec.read_id) && temp_bam_lines.size() > 0) { break; } _lines.pop(); _last_id = brec.read_id; temp_bam_lines.push_back(brec); //reuse brec brec.b = bam_init1(); if (samread(_src_files[brec.filenum], brec.b)>0) { brec.b_init(_src_files[brec.filenum]->header); _lines.push(brec); } else { //no more BAM records brec.b_free(); } } if (temp_bam_lines.size() <= 0) return false; // we need to eliminate duplicate alignments, which can happen when using Bowtie2 // as we may often map the same read against transcriptome, genome, and novel/known splice junctions. std::sort (temp_bam_lines.begin(), temp_bam_lines.end(), less_bam()); bool sense_strand = false, antisense_strand = false; vector<bool> free_indexes(temp_bam_lines.size(), false); for (size_t i = 0; i < temp_bam_lines.size(); ++i) { bool do_write = true; CBamLine& bam_line = temp_bam_lines[i]; uint8_t* ptr = bam_aux_get(bam_line.b, "XS"); char strand = 0; if (ptr) strand = bam_aux2A(ptr); if (i > 0) { if (equal_bam()(temp_bam_lines[i-1], bam_line)) { if (strand == 0) { do_write = false; } else { if (strand == '+' && sense_strand) do_write = false; else sense_strand = true; if (strand == '-' && antisense_strand) do_write = false; else antisense_strand = true; } } else { sense_strand = false; antisense_strand = false; } } if (strand == '+') sense_strand = true; else if (strand == '-') antisense_strand = true; if (do_write) bam_lines.push_back(bam_line); else free_indexes[i] = true; } for (size_t i = 0; i < free_indexes.size(); ++i) { if (free_indexes[i]) temp_bam_lines[i].b_free(); } return bam_lines.size() > 0; }
static int aux_fields1(void) { static const char sam[] = "data:," "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n"; // Canonical form of the alignment record above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" NEW_HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:f:9.8765\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\tN0:i:-1234\tN1:i:1234\tN2:i:-2\tN3:i:3\tF1:f:4.5678\tN4:B:S,65535,32768,1,0\tN5:i:4242"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); uint8_t *p; kstring_t ks = { 0, 0, NULL }; int64_t b0vals[5] = { -2147483648LL,-1,0,1,2147483647LL }; // i int64_t b1vals[4] = { 0,1,2147483648LL,4294967295LL }; // I int64_t b2vals[5] = { -32768,-1,0,1,32767 }; // s int64_t b3vals[4] = { 0,1,32768,65535 }; // S int64_t b4vals[5] = { -128,-1,0,1,127 }; // c int64_t b5vals[4] = { 0,1,127,255 }; // C // NB: Floats not doubles below! // See https://randomascii.wordpress.com/2012/06/26/doubles-are-not-floats-so-dont-compare-them/ float bfvals[2] = { -3.14159f, 2.71828f }; int8_t n4v1[] = { -128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64, 127 }; uint32_t n4v2[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1234, 5678, 1U << 31, 0 }; int16_t n4v3[] = { -32768, -1, 0, 1, 32767 }; float n4v4[] = { 0, 1, 2, 10, 20, 30, 1.5, -2.5 }; uint8_t n4v5[] = { 0, 255 }; int32_t n4v6[] = { -2147483647 - 1, 10, -1, 0, 1, 2147483647 }; uint16_t n4v7[] = { 65535, 32768, 1, 0 }; int32_t ival = -1234; uint32_t uval = 1234; float f1 = 4.5678; float f2 = 9.8765; size_t nvals, i; if (sam_read1(in, header, aln) >= 0) { if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); bam_aux_del(aln,p); if (bam_aux_get(aln,"XA")) fail("XA field was not deleted"); if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %"PRId64", expected 37", bam_aux2i(p)); if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6) fail("Xf field is %.12f, expected pi", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6) fail("Xf field is %.12f, expected e", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO); bam_aux_update_str(aln,"XZ",strlen(NEW_HELLO)+1,NEW_HELLO); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), NEW_HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), NEW_HELLO); if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0) fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF); if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! (memcmp(p, "Bc", 2) == 0 && memcmp(p + 2, "\x03\x00\x00\x00\xfe\x00\x02", 7) == 0)) fail("XB field is %c,..., expected c,-2,0,+2", p[1]); check_int_B_array(aln, "B0", NELE(b0vals), b0vals); check_int_B_array(aln, "B1", NELE(b1vals), b1vals); check_int_B_array(aln, "B2", NELE(b2vals), b2vals); check_int_B_array(aln, "B3", NELE(b3vals), b3vals); check_int_B_array(aln, "B4", NELE(b4vals), b4vals); check_int_B_array(aln, "B5", NELE(b5vals), b5vals); nvals = NELE(bfvals); if ((p = check_bam_aux_get(aln, "Bf", 'B')) != NULL) { if (bam_auxB_len(p) != nvals) fail("Wrong length reported for Bf field, got %d, expected %zd\n", bam_auxB_len(p), nvals); for (i = 0; i < nvals; i++) { if (bam_auxB2f(p, i) != bfvals[i]) { fail("Wrong value from bam_auxB2f for Bf field index %zd, " "got %f expected %f\n", i, bam_auxB2f(p, i), bfvals[i]); } } } if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %"PRId64", expected 1000000", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1) fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647) fail("Y2 field is %"PRId64", expected -2^31+1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1) fail("Y3 field is %"PRId64", expected -1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0) fail("Y4 field is %"PRId64", expected 0", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1) fail("Y5 field is %"PRId64", expected 1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647) fail("Y6 field is %"PRId64", expected 2^31-1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648LL) fail("Y7 field is %"PRId64", expected 2^31", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295LL) fail("Y8 field is %"PRId64", expected 2^32-1", bam_aux2i(p)); // Try appending some new tags if (bam_aux_append(aln, "N0", 'i', sizeof(ival), (uint8_t *) &ival) != 0) fail("Failed to append N0:i tag"); if ((p = bam_aux_get(aln, "N0")) && bam_aux2i(p) != ival) fail("N0 field is %"PRId64", expected %d", bam_aux2i(p), ival); if (bam_aux_append(aln, "N1", 'I', sizeof(uval), (uint8_t *) &uval) != 0) fail("failed to append N1:I tag"); if ((p = bam_aux_get(aln, "N1")) && bam_aux2i(p) != uval) fail("N1 field is %"PRId64", expected %u", bam_aux2i(p), uval); // Append tags with bam_aux_update_int() if (bam_aux_update_int(aln, "N2", -2) < 0) fail("failed to append N2:c tag"); if (bam_aux_update_int(aln, "N3", 3) < 0) fail("failed to append N3:C tag"); p = bam_aux_get(aln, "N2"); if (!p) fail("failed to retrieve N2 tag"); else if (*p != 'c' || bam_aux2i(p) != -2) fail("N2 field is %c:%"PRId64", expected c:-2", *p, bam_aux2i(p)); p = bam_aux_get(aln, "N3"); if (!p) fail("failed to retrieve N3 tag"); else if (*p != 'C' || bam_aux2i(p) != 3) fail("N3 field is %c:%"PRId64", expected C:3", *p, bam_aux2i(p)); // Try changing values with bam_aux_update_int() i = test_update_int(aln, "N2", 2, 'C', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", 1234, 'S', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", -1, 's', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", 4294967295U, 'I', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", -2, 'i', "N3", 3, 'C'); // Append a value with bam_aux_update_float() if (bam_aux_update_float(aln, "F1", f1) < 0) fail("append F1:f tag"); p = bam_aux_get(aln, "F1"); if (!p) fail("retrieve F1 tag"); else if (*p != 'f' || bam_aux2f(p) != f1) fail("F1 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f1); // Change a double tag to a float if (bam_aux_update_float(aln, "F2", f2) < 0) fail("update F2 tag"); p = bam_aux_get(aln, "F2"); if (!p) fail("retrieve F2 tag"); else if (*p != 'f' || bam_aux2f(p) != f2) fail("F2 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f2); // Check the next one is intact too p = bam_aux_get(aln, "Y1"); if (!p) fail("retrieve Y1 tag"); else if (*p != 'i' && bam_aux2i(p) != -2147483647-1) fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p)); // bam_aux_update_array tests // append a new array i = test_update_array(aln, "N4", 'c', NELE(n4v1), n4v1, "\0\0", 0, 0); // Add a sentinal to check resizes work if (i == 0) i = test_update_int(aln, "N5", 4242, 'S', "\0\0", 0, 0); // alter the array tag a few times if (i == 0) i = test_update_array(aln, "N4", 'I', NELE(n4v2), n4v2, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 's', NELE(n4v3), n4v3, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'f', NELE(n4v4), n4v4, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'c', NELE(n4v5), n4v5, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'i', NELE(n4v6), n4v6, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'S', NELE(n4v7), n4v7, "N5", 4242, 'S'); if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); free(ks.s); } else fail("can't read record"); bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); return 1; }