/* * Unpacks the 31-byte fixed size part of the SFF common header. * It allocates memory for this and for the flow order and key, but does * not read the flow & key information (as this may not be in buf). * It also checks that the MAGIC and VERSION match as expected. * * Returns sff_common_header* on success * NULL on failure */ sff_common_header *decode_sff_common_header(unsigned char *buf) { sff_common_header *h; if (NULL == (h = (sff_common_header *)xcalloc(1, sizeof(*h)))) return NULL; h->magic = be_int4(*(uint32_t *)(buf+0)); memcpy(h->version, buf+4, 4); h->index_offset = be_int8(*(uint64_t *)(buf+8)); h->index_len = be_int4(*(uint32_t *)(buf+16)); h->nreads = be_int4(*(uint32_t *)(buf+20)); h->header_len = be_int2(*(uint16_t *)(buf+24)); h->key_len = be_int2(*(uint16_t *)(buf+26)); h->flow_len = be_int2(*(uint16_t *)(buf+28)); h->flowgram_format = be_int1(*(uint8_t *)(buf+30)); if (h->magic != SFF_MAGIC || memcmp(h->version, SFF_VERSION, 4)) { xfree(h); return NULL; } if (NULL == (h->flow = (char *)xmalloc(h->flow_len))) return free_sff_common_header(h), NULL; if (NULL == (h->key = (char *)xmalloc(h->key_len))) return free_sff_common_header(h), NULL; return h; }
/* * ztr_read_chunk_hdr * * Reads a ZTR chunk header and metadata, but not the main data segment. * * Arguments: * fp A FILE pointer * * Returns: * Success: a chunk pointer (malloced) * Failure: NULL */ ztr_chunk_t *ztr_read_chunk_hdr(FILE *fp) { int4 bei4; ztr_chunk_t *chunk; if (NULL == (chunk = (ztr_chunk_t *)xmalloc(sizeof(*chunk)))) return NULL; /* type */ if (1 != fread(&bei4, 4, 1, fp)) { xfree(chunk); return NULL; } chunk->type = be_int4(bei4); /* metadata length */ if (1 != fread(&bei4, 4, 1, fp)) { xfree(chunk); return NULL; } chunk->mdlength = be_int4(bei4); /* metadata */ chunk->ztr_owns = 1; if (chunk->mdlength) { if (NULL == (chunk->mdata = (char *)xmalloc(chunk->mdlength))) { xfree(chunk); return NULL; } if (chunk->mdlength != fread(chunk->mdata, 1, chunk->mdlength, fp)) { xfree(chunk->mdata); xfree(chunk); return NULL; } } else { chunk->mdata = NULL; } /* data length */ if (1 != fread(&bei4, 4, 1, fp)) { if (chunk->mdata) xfree(chunk->mdata); xfree(chunk); return NULL; } chunk->dlength = be_int4(bei4); return chunk; }
/*
 * Encodes the data in 'h' to the file SFF representation. Buf should be
 * allocated to be 31 + h->flow_len + h->key_len + 8.
 *
 * The fixed-size fields are written with memcpy rather than through casted
 * pointers, so 'buf' needs no particular alignment. The output is zero
 * padded up to the next multiple of 8 bytes.
 *
 * Returns: the written length of buf
 */
int encode_sff_common_header(sff_common_header *h, unsigned char *buf) {
    uint64_t v8;
    uint32_t v4;
    uint16_t v2;
    int end, padded;

    v4 = be_int4(h->magic);
    memcpy(buf+0, &v4, 4);

    memcpy(buf+4, h->version, 4);

    v8 = be_int8(h->index_offset);
    memcpy(buf+8, &v8, 8);

    v4 = be_int4(h->index_len);
    memcpy(buf+16, &v4, 4);

    v4 = be_int4(h->nreads);
    memcpy(buf+20, &v4, 4);

    v2 = be_int2(h->header_len);
    memcpy(buf+24, &v2, 2);

    v2 = be_int2(h->key_len);
    memcpy(buf+26, &v2, 2);

    v2 = be_int2(h->flow_len);
    memcpy(buf+28, &v2, 2);

    buf[30] = be_int1(h->flowgram_format);

    /* Variable length part: flow order followed by key */
    memcpy(buf+31, h->flow, h->flow_len);
    memcpy(buf+31+h->flow_len, h->key, h->key_len);

    /* Zero pad to an 8-byte boundary (0 to 7 bytes of padding) */
    end = 31 + h->flow_len + h->key_len;
    padded = (end+7) & ~7;
    memcpy(buf+end, "\0\0\0\0\0\0\0\0", padded-end);

    return padded;
}
/*
 * ztr_write_chunk
 *
 * Writes one ZTR chunk to 'fp': the type, the metadata length, the
 * metadata bytes (only when the length is non-zero), the data length and
 * finally the data bytes. The three 4-byte fields are passed through
 * be_int4 before being written.
 *
 * Arguments:
 *     fp       A FILE pointer
 *     chunk    A pointer to the chunk to write
 *
 * Returns:
 *     Success:  0
 *     Failure: -1 (any fwrite wrote less than requested)
 */
static int ztr_write_chunk(FILE *fp, ztr_chunk_t *chunk) {
    int4 word;

    /* type */
    word = be_int4(chunk->type);
    if (fwrite(&word, 4, 1, fp) != 1)
        return -1;

    /* metadata length */
    word = be_int4(chunk->mdlength);
    if (fwrite(&word, 4, 1, fp) != 1)
        return -1;

    /* metadata, skipped entirely when empty */
    if (chunk->mdlength &&
        fwrite(chunk->mdata, 1, chunk->mdlength, fp) != chunk->mdlength)
        return -1;

    /* data length */
    word = be_int4(chunk->dlength);
    if (fwrite(&word, 4, 1, fp) != 1)
        return -1;

    /* data */
    if (fwrite(chunk->data, 1, chunk->dlength, fp) != chunk->dlength)
        return -1;

    return 0;
}
/*
 * Encodes the data in 'h' to the file SFF representation. Buf should be
 * allocated to be 16 + h->name_len + 8.
 *
 * The fixed-size fields are written with memcpy rather than through casted
 * pointers, so 'buf' needs no particular alignment. The output is zero
 * padded up to the next multiple of 8 bytes.
 *
 * Returns: the written length of buf
 */
int encode_sff_read_header(sff_read_header *h, unsigned char *buf) {
    uint32_t v4;
    uint16_t v2;
    int end, padded;

    v2 = be_int2(h->header_len);
    memcpy(buf+0, &v2, 2);

    v2 = be_int2(h->name_len);
    memcpy(buf+2, &v2, 2);

    v4 = be_int4(h->nbases);
    memcpy(buf+4, &v4, 4);

    v2 = be_int2(h->clip_qual_left);
    memcpy(buf+8, &v2, 2);

    v2 = be_int2(h->clip_qual_right);
    memcpy(buf+10, &v2, 2);

    v2 = be_int2(h->clip_adapter_left);
    memcpy(buf+12, &v2, 2);

    v2 = be_int2(h->clip_adapter_right);
    memcpy(buf+14, &v2, 2);

    /* Variable length part: the read name (not NUL terminated) */
    memcpy(buf+16, h->name, h->name_len);

    /* Zero pad to an 8-byte boundary (0 to 7 bytes of padding) */
    end = 16 + h->name_len;
    padded = (end+7) & ~7;
    memcpy(buf+end, "\0\0\0\0\0\0\0\0", padded-end);

    return padded;
}
/*
 * Reads one 12-byte base record from 'fp' into 'b'.
 *
 * Record layout: a 4-byte peak index (converted with be_int4) followed by
 * single bytes for prob_A, prob_C, prob_G, prob_T, base and three spare
 * bytes.
 *
 * The record is read into a union so that the peak index can be fetched as
 * a uint_4 without casting a byte array pointer, which would break the
 * aliasing rules and may be misaligned.
 *
 * Returns 0 on success
 *        -1 if the record could not be read
 */
int read_scf_base(FILE *fp, Bases *b) {
    union {
        uint_1 u1[12];
        uint_4 u4[3];
    } buf;

    if (1 != fread(buf.u1, 12, 1, fp))
        return -1;

    b->peak_index = be_int4(buf.u4[0]);
    b->prob_A     = buf.u1[4];
    b->prob_C     = buf.u1[5];
    b->prob_G     = buf.u1[6];
    b->prob_T     = buf.u1[7];
    b->base       = buf.u1[8];
    b->spare[0]   = buf.u1[9];
    b->spare[1]   = buf.u1[10];
    b->spare[2]   = buf.u1[11];

    return 0;
}
/*
 * Writes one 12-byte base record for 'b' to 'fp'.
 *
 * Record layout: a 4-byte peak index (converted with be_int4) followed by
 * single bytes for prob_A, prob_C, prob_G, prob_T, base and three spare
 * bytes.
 *
 * The record is assembled in a union so that the peak index can be stored
 * as a uint_4 without casting a byte array pointer, which would break the
 * aliasing rules and may be misaligned.
 *
 * Returns 0 on success
 *        -1 if fewer than 12 bytes were written
 */
int write_scf_base(FILE *fp, Bases *b) {
    union {
        uint_1 u1[12];
        uint_4 u4[3];
    } buf;

    buf.u4[0]  = be_int4(b->peak_index);
    buf.u1[4]  = b->prob_A;
    buf.u1[5]  = b->prob_C;
    buf.u1[6]  = b->prob_G;
    buf.u1[7]  = b->prob_T;
    buf.u1[8]  = b->base;
    buf.u1[9]  = b->spare[0];
    buf.u1[10] = b->spare[1];
    buf.u1[11] = b->spare[2];

    if (12 != fwrite(buf.u1, 1, 12, fp))
        return -1;

    return 0;
}
/* * Unpacks the 16-byte fixed size part of the SFF read header. * It allocates memory for this and for the base calls, but does not * unpack these. * * Returns sff_read_header* on success * NULL on failure */ sff_read_header *decode_sff_read_header(unsigned char *buf) { sff_read_header *h; if (NULL == (h = (sff_read_header *)xcalloc(1, sizeof(*h)))) return NULL; h->header_len = be_int2(*(uint16_t *)(buf+0)); h->name_len = be_int2(*(uint16_t *)(buf+2)); h->nbases = be_int4(*(uint32_t *)(buf+4)); h->clip_qual_left = be_int2(*(uint16_t *)(buf+8)); h->clip_qual_right = be_int2(*(uint16_t *)(buf+10)); h->clip_adapter_left = be_int2(*(uint16_t *)(buf+12)); h->clip_adapter_right = be_int2(*(uint16_t *)(buf+14)); if (NULL == (h->name = (char *)xmalloc(h->name_len))) return free_sff_read_header(h), NULL; return h; }
/*
 * Fetches a single 12-byte base record from 'fp' and unpacks it into 'b'.
 *
 * The first four bytes hold the peak index (passed through be_int4); the
 * remaining eight are, in order, prob_A, prob_C, prob_G, prob_T, base and
 * the three spare bytes. A union gives both byte-wise and word-wise views
 * of the record.
 *
 * Returns 0 on success
 *        -1 if the record could not be read
 */
int read_scf_base(FILE *fp, Bases *b) {
    union {
        uint_1 u1[12];
        uint_4 u4[3];
    } rec;
    int k;

    if (fread(rec.u1, 12, 1, fp) != 1)
        return -1;

    b->peak_index = be_int4(rec.u4[0]);

    b->prob_A = rec.u1[4];
    b->prob_C = rec.u1[5];
    b->prob_G = rec.u1[6];
    b->prob_T = rec.u1[7];
    b->base   = rec.u1[8];

    /* Bytes 9..11 are the spare fields */
    for (k = 0; k < 3; k++)
        b->spare[k] = rec.u1[9 + k];

    return 0;
}
/*
 * Writes 'num_bases' base records from 'b' to 'fp' in a column-wise layout:
 * first all peak indices (4 bytes each, converted with be_int4), then
 * num_bases bytes each of prob_A, prob_C, prob_G, prob_T, base, spare[0],
 * spare[1] and spare[2].
 *
 * Both writes are checked; a short write of either section is an error.
 *
 * Returns 0 on success
 *        -1 on allocation or write failure
 */
int write_scf_bases3(FILE *fp, Bases *b, size_t num_bases) {
    size_t i;
    uint_4 *buf4;
    uint_1 *buf1;
    int ret = -1;

    /* The "1 +" keeps the request non-zero when num_bases is 0 */
    if (NULL == (buf4 = (uint_4 *)xmalloc(1 + 4 * num_bases)))
        return -1;

    if (NULL == (buf1 = (uint_1 *)xmalloc(1 + 8 * num_bases))) {
        xfree(buf4);
        return -1;
    }

    /* Peak indices */
    for (i = 0; i < num_bases; i++)
        buf4[i] = be_int4(b[i].peak_index);

    if (num_bases != fwrite(buf4, 4, num_bases, fp))
        goto done;

    /* Eight byte-wide columns, each num_bases long */
    for (i = 0; i < num_bases; i++) {
        buf1[i              ] = b[i].prob_A;
        buf1[i +   num_bases] = b[i].prob_C;
        buf1[i + 2*num_bases] = b[i].prob_G;
        buf1[i + 3*num_bases] = b[i].prob_T;
        buf1[i + 4*num_bases] = b[i].base;
        buf1[i + 5*num_bases] = b[i].spare[0];
        buf1[i + 6*num_bases] = b[i].spare[1];
        buf1[i + 7*num_bases] = b[i].spare[2];
    }

    if (8 * num_bases != fwrite(buf1, 1, 8 * num_bases, fp))
        goto done;

    ret = 0;

 done:
    xfree(buf1);
    xfree(buf4);
    return ret;
}
/*
 * Reads 'num_bases' base records from 'fp' into 'b'. The data is laid out
 * column-wise: first all peak indices (4 bytes each, converted with
 * be_int4), then num_bases bytes each of prob_A, prob_C, prob_G, prob_T,
 * base, spare[0], spare[1] and spare[2].
 *
 * The temporary buffers are released on every path, including read errors.
 * If the second read fails the peak indices in 'b' have already been
 * filled in.
 *
 * Returns 0 on success
 *        -1 on allocation or read failure
 */
int read_scf_bases3(FILE *fp, Bases *b, size_t num_bases) {
    size_t i;
    uint_4 *buf4;
    uint_1 *buf1;
    int ret = -1;

    /* The "1 +" keeps the request non-zero when num_bases is 0 */
    if (NULL == (buf4 = (uint_4 *)xmalloc(1 + 4 * num_bases)))
        return -1;

    if (NULL == (buf1 = (uint_1 *)xmalloc(1 + 8 * num_bases))) {
        xfree(buf4);
        return -1;
    }

    /* Peak indices */
    if (num_bases != fread(buf4, 4, num_bases, fp))
        goto done;

    for (i = 0; i < num_bases; i++)
        b[i].peak_index = be_int4(buf4[i]);

    /* Eight byte-wide columns, each num_bases long */
    if (8 * num_bases != fread(buf1, 1, 8 * num_bases, fp))
        goto done;

    for (i = 0; i < num_bases; i++) {
        b[i].prob_A   = buf1[i];
        b[i].prob_C   = buf1[i +   num_bases];
        b[i].prob_G   = buf1[i + 2*num_bases];
        b[i].prob_T   = buf1[i + 3*num_bases];
        b[i].base     = buf1[i + 4*num_bases];
        b[i].spare[0] = buf1[i + 5*num_bases];
        b[i].spare[1] = buf1[i + 6*num_bases];
        b[i].spare[2] = buf1[i + 7*num_bases];
    }

    ret = 0;

 done:
    xfree(buf4);
    xfree(buf1);
    return ret;
}
/*
 * Parse the REGN chunk, add to regn HASH
 *
 * The hash key is built from the chunk's NAME metadata and its boundary
 * values ("names=<NAME> boundaries=<b0>;<b1>..."). If an entry with that key
 * already exists its use count is incremented; otherwise a new regn_t is
 * built from the NAME ("name:code;name:code...") and COORD metadata plus the
 * boundaries, and added to the hash with a count of 1.
 *
 * Failure cases (all return NULL and release anything allocated here):
 *   - no NAME metadata
 *   - more than MAX_REGIONS boundaries or regions
 *   - hash key does not fit in the local key buffer
 *   - a name/code pair without ':'
 *   - more non-'E' regions than boundaries allow
 *   - memory allocation or hash insertion failure
 *
 * Returns corresponding HashItem * from regn Hash
 *         NULL on failure
 */
HashItem *parse_regn(ztr_t *z, ztr_chunk_t *chunk, HashTable *regn_hash) {
    char key[1024];
    char *name, *coord, *cp1;
    HashItem *hi;
    HashData hd;
    regn_t *regn;
    size_t l;
    int n;
    uint4 bndy[MAX_REGIONS];
    int ibndy, nbndy = 0;
    int iregion, nregions = 0;

    uncompress_chunk(z, chunk);

    name = ztr_lookup_mdata_value(z, chunk, "NAME");
    if (NULL == name) {
        fprintf(stderr, "REGN chunk has no NAME metadata\n");
        return NULL;
    }

    /*
     * Boundaries are 4-byte values starting one byte into the data, so they
     * are not aligned; copy them into a local array before use.
     */
    if (chunk->dlength) {
        nbndy = (chunk->dlength-1)/4;
        if (nbndy > MAX_REGIONS) {
            fprintf(stderr, "Too many region boundaries (%d)\n", nbndy);
            return NULL;
        }
        memcpy(bndy, chunk->data+1, 4 * (size_t)nbndy);
    }

    /* the hash key is a combination of the region names and boundaries */
    n = snprintf(key, sizeof(key), "names=%s", name);
    if (n < 0 || (size_t)n >= sizeof(key)) {
        fprintf(stderr, "Region names too long for hash key\n");
        return NULL;
    }
    l = (size_t)n;

    for (ibndy = 0; ibndy < nbndy; ibndy++) {
        if (ibndy)
            n = snprintf(key + l, sizeof(key) - l, ";%d",
                         (int)be_int4(bndy[ibndy]));
        else
            n = snprintf(key + l, sizeof(key) - l, " boundaries=%d",
                         (int)be_int4(bndy[ibndy]));

        /* Never let l run past the buffer: sizeof(key) - l must not wrap */
        if (n < 0 || (size_t)n >= sizeof(key) - l) {
            fprintf(stderr, "Region names too long for hash key\n");
            return NULL;
        }
        l += (size_t)n;
    }

    /* Already known: just bump the use count */
    if (NULL != (hi = HashTableSearch(regn_hash, key, l))) {
        regn = (regn_t *)(hi->data.p);
        regn->count++;
        return hi;
    }

    if (NULL == (regn = (regn_t *)malloc(sizeof(regn_t))))
        return NULL;
    regn->region_names = NULL;

    coord = ztr_lookup_mdata_value(z, chunk, "COORD");
    regn->coord = (NULL == coord ? 'B' : *coord);

    if (NULL == (regn->region_names = strdup(name)))
        goto fail;

    /* Split "name:code;name:code..." in place; name[] points into the copy */
    for (cp1 = strtok(regn->region_names, ";");
         cp1;
         cp1 = strtok(NULL, ";")) {
        char *cp2;

        if (NULL == (cp2 = strchr(cp1, ':'))) {
            fprintf(stderr, "Invalid region name/code pair %s\n", cp1);
            goto fail;
        }

        /*
         * NOTE(review): assumes the per-region arrays in regn_t hold
         * MAX_REGIONS entries, like the local boundary buffer - confirm.
         */
        if (nregions >= MAX_REGIONS) {
            fprintf(stderr, "Too many regions in %s\n", name);
            goto fail;
        }

        *cp2++ = '\0';
        regn->name[nregions] = cp1;
        regn->code[nregions] = *cp2;
        nregions++;
    }
    regn->nregions = nregions;

    for (iregion = 0, ibndy = 0; iregion < nregions; iregion++) {
        /* start = (start + length of previous region) or 0 if no previous region */
        regn->start[iregion] = (iregion
                                ? (regn->start[iregion-1] + regn->length[iregion-1])
                                : 0);

        if (regn->code[iregion] == 'E') {
            /* no sequence, length = 0 */
            regn->length[iregion] = 0;
        } else {
            if (ibndy > nbndy) {
                fprintf(stderr, "More name/code pairs than boundaries\n");
                goto fail;
            }

            /* length = (next boundary - start of region) or -1 if no next boundary */
            regn->length[iregion] = (ibndy == nbndy
                                     ? -1
                                     : (be_int4(bndy[ibndy]) - regn->start[iregion]));
            ibndy++;
        }
    }

    regn->count = 1;

    hd.p = regn;
    if (NULL == (hi = HashTableAdd(regn_hash, key, l, hd, NULL)))
        goto fail;

    return hi;

 fail:
    free(regn->region_names);
    free(regn);
    return NULL;
}
int main(int argc, char **argv) { HashFile *hf; sff_common_header *ch; sff_read_header *rh; int i, dot, arg; char *sff; char hdr[31]; uint64_t index_offset = 0; uint32_t index_size, index_skipped; FILE *fp, *fpout = NULL; int copy_archive = 1; /* process command line arguments of the form -arg */ for (argc--, argv++; argc > 0; argc--, argv++) { if (**argv != '-' || strcmp(*argv, "--") == 0) break; if (strcmp(*argv, "-o") == 0 && argc > 1) { if (NULL == (fpout = fopen(argv[1], "wb+"))) { perror(argv[1]); return 1; } argv++; argc--; } else if (strcmp(*argv, "-t") == 0) { copy_archive = 0; } else if (**argv == '-') { usage(); } } if (argc < 1) usage(); if (copy_archive == 0 && argc != 1) { fprintf(stderr, "-t option only supported with a single sff argument\n"); return 1; } /* Create the hash table */ hf = HashFileCreate(0, HASH_DYNAMIC_SIZE); hf->nheaders = 0; hf->headers = NULL; for (arg = 0; arg < argc; arg++) { /* open (and read) the entire sff file */ sff = argv[arg]; printf("Indexing %s:\n", sff); if (fpout) { if (NULL == (fp = fopen(sff, "rb"))) { perror(sff); return 1; } } else { if (NULL == (fp = fopen(sff, "rb+"))) { perror(sff); return 1; } } /* Read the common header */ ch = fread_sff_common_header(fp); if (ch->index_len && !fpout) { fprintf(stderr, "Archive already contains index.\nReplacing the" " index requires the \"-o outfile\" option.\n"); return 1; } /* Add the SFF common header as a hash file-header */ hf->nheaders++; hf->headers = (HashFileSection *)realloc(hf->headers, hf->nheaders * sizeof(*hf->headers)); hf->headers[hf->nheaders-1].pos = 0; hf->headers[hf->nheaders-1].size = ch->header_len; hf->headers[hf->nheaders-1].cached_data = NULL; /* Read the index items, adding to the hash */ index_skipped = 0; dot = 0; printf(" |\r|"); for (i = 0; i < ch->nreads; i++) { int dlen; uint32_t offset; HashData hd; HashFileItem *hfi; if (i >= dot * (ch->nreads/69)) { putchar('.'); fflush(stdout); dot++; } /* Skip old index if present */ offset = ftell(fp); 
if (offset == ch->index_offset) { fseek(fp, ch->index_len, SEEK_CUR); index_skipped = ch->index_len; continue; } hfi = (HashFileItem *)calloc(1, sizeof(*hfi)); rh = fread_sff_read_header(fp); dlen = (2*ch->flow_len + 3*rh->nbases + 7) & ~7; fseek(fp, dlen, SEEK_CUR); hfi->header = hf->nheaders; hfi->footer = 0; hfi->pos = offset - index_skipped; hfi->size = (ftell(fp) - index_skipped) - hfi->pos; hd.p = hfi; HashTableAdd(hf->h, rh->name, rh->name_len, hd, NULL); } printf("\n"); HashTableStats(hf->h, stdout); index_offset = ftell(fp) - index_skipped; /* Copy the archive if needed, minus the old index */ if (fpout && copy_archive) { char block[8192]; size_t len; uint64_t pos = 0; printf("\nCopying archive\n"); fseek(fp, 0, SEEK_SET); while (len = fread(block, 1, 8192, fp)) { /* Skip previous index */ if (pos < ch->index_offset && pos+len > ch->index_offset) { len = ch->index_offset - pos; fseek(fp, ch->index_offset + ch->index_len, SEEK_SET); } if (len && len != fwrite(block, 1, len, fpout)) { fprintf(stderr, "Failed to output new archive\n"); return 1; } pos += len; } } if (!fpout) { /* Save the hash */ printf("Saving index\n"); fseek(fp, 0, SEEK_END); index_size = HashFileSave(hf, fp, 0); HashFileDestroy(hf); /* Update the common header */ fseek(fp, 0, SEEK_SET); fread(hdr, 1, 31, fp); *(uint64_t *)(hdr+8) = be_int8(index_offset); *(uint32_t *)(hdr+16) = be_int4(index_size); fseek(fp, 0, SEEK_SET); fwrite(hdr, 1, 31, fp); } fclose(fp); } if (fpout) { /* Save the hash */ printf("Saving index\n"); if (!copy_archive) { hf->archive = strdup(argv[0]); index_offset = 0; } fseek(fpout, 0, SEEK_END); index_size = HashFileSave(hf, fpout, 0); HashFileDestroy(hf); /* Update the common header to indicate index location */ if (copy_archive) { fseek(fpout, 0, SEEK_SET); fread(hdr, 1, 31, fpout); *(uint64_t *)(hdr+8) = be_int8(index_offset); *(uint32_t *)(hdr+16) = be_int4(index_size); fseek(fpout, 0, SEEK_SET); fwrite(hdr, 1, 31, fpout); } fclose(fpout); } return 0; }