/*
 * pbwtSubRange: build and return a new PBWT holding only the sites i of pOld
 * with start <= i < end.
 *
 *   pOld   source PBWT; it is destroyed before returning, so the caller must
 *          not use it afterwards
 *   start  index of the first site to keep (must be >= 0)
 *   end    one past the last site to keep (start < end <= pOld->N)
 *
 * Calls die() if pOld is NULL, has no yz array, or the range is invalid.
 */
PBWT *pbwtSubRange (PBWT *pOld, int start, int end)
{
  /* validate before anything touches pOld - previously pOld->M was read
     ahead of the NULL test, so the test could never report the problem */
  if (!pOld || !pOld->yz) die ("subrange without an existing pbwt") ;
  if (start < 0 || end > pOld->N || end <= start)
    die ("subrange invalid start %d, end %d", start, end) ;

  int M = pOld->M ;
  PBWT *pNew = pbwtCreate (M) ;
  int i, j ;
  PbwtCursor *uOld = pbwtCursorCreate (pOld, TRUE, TRUE) ;

  /* the output array must exist before the writing cursor is created on pNew */
  pNew->yz = arrayCreate (pNew->N*8, uchar) ;
  PbwtCursor *uNew = pbwtCursorCreate (pNew, TRUE, TRUE) ;

  uchar *x = myalloc (M, uchar) ;	/* one site's values, indexed by a[] entry */
  if (pOld->sites) pNew->sites = arrayCreate (4096, Site) ;

  /* sites before start are only stepped over so that uOld is advanced to start */
  for (i = 0 ; i < end ; ++i)
    { if (i >= start)
	{ /* scatter through the old cursor's a[], gather through the new one's */
	  for (j = 0 ; j < M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
	  for (j = 0 ; j < M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
	  pbwtCursorWriteForwards (uNew) ;
	  if (pOld->sites)
	    array(pNew->sites, pNew->N, Site) = arr(pOld->sites, i, Site) ;
	  ++pNew->N ;
	}
      pbwtCursorForwardsRead (uOld) ;
    }

  /* keep the new cursor's final a[] as pNew->aFend */
  pNew->aFend = myalloc (pNew->M, int) ;
  memcpy (pNew->aFend, uNew->a, pNew->M*sizeof(int)) ;

  pbwtDestroy (pOld) ;
  pbwtCursorDestroy (uOld) ;
  pbwtCursorDestroy (uNew) ;
  free (x) ;
  return pNew ;
}
/*
 * pbwtBuildReverse: fill p->zz by reading p's forward PBWT backwards from its
 * end and writing each site through a second cursor created with
 * pbwtCursorCreate (p, FALSE, TRUE).
 *
 * Side effects on p:
 *   - p->aRstart is allocated if absent and set to the forward cursor's a[]
 *     at the end of the forward pass
 *   - p->zz is (re)created
 *   - p->aRend is allocated if absent and set to the reverse cursor's final a[]
 *   - if p->aFend was not set, pbwtCursorToAFend() is called after running the
 *     forward cursor over all N sites
 *
 * When the global isCheck is set, the reversed haplotypes are also written to
 * the file "rev.haps" in the current directory.
 */
void pbwtBuildReverse (PBWT *p)
{
  int i, j, M = p->M ;
  uchar *x = myalloc (M, uchar) ;	/* one site's values, indexed by a[] entry */
  PbwtCursor *uF ;

  if (p->aFend)
    uF = pbwtCursorCreate (p, TRUE, FALSE) ;
  else
    { uF = pbwtCursorCreate (p, TRUE, TRUE) ;
      for (i = 0 ; i < p->N ; ++i)	/* first run forwards to the end */
	pbwtCursorForwardsRead (uF) ;
      pbwtCursorToAFend (uF, p) ;
    }

  /* use p->aFend also to start the reverse cursor - this gives better performance */
  if (!p->aRstart) p->aRstart = myalloc (M, int) ;
  memcpy (p->aRstart, uF->a, M * sizeof(int)) ;
  p->zz = arrayReCreate (p->zz, arrayMax(p->yz), uchar) ;
  PbwtCursor *uR = pbwtCursorCreate (p, FALSE, TRUE) ;	/* will pick up aRstart */

  for (i = p->N ; i-- ; )
    { pbwtCursorReadBackwards (uF) ;
      /* scatter through the forward cursor's a[], gather through the reverse one's */
      for (j = 0 ; j < M ; ++j) x[uF->a[j]] = uF->y[j] ;
      for (j = 0 ; j < M ; ++j) uR->y[j] = x[uR->a[j]] ;
      pbwtCursorWriteForwards (uR) ;
    }

  /* save uR->a, which is the lexicographic order of the sequences */
  if (!p->aRend) p->aRend = myalloc (M, int) ;
  memcpy (p->aRend, uR->a, M * sizeof(int)) ;

  fprintf (logFile, "built reverse PBWT - size %ld\n", arrayMax(p->zz)) ;

  if (isCheck)			/* print out the reversed haplotypes */
    { FILE *fp = fopen ("rev.haps","w") ;
      /* previously an unopenable file handed NULL to the writer */
      if (!fp) die ("failed to open rev.haps for writing") ;

      /* temporarily present the reverse data as if it were the forward data */
      Array tz = p->yz ; p->yz = p->zz ;
      int *ta = p->aFstart ; p->aFstart = p->aRstart ;
      pbwtWriteHaplotypes (fp, p) ;
      p->yz = tz ; p->aFstart = ta ;

      fclose (fp) ;		/* previously leaked */
    }

  free (x) ;
  pbwtCursorDestroy (uF) ;
  pbwtCursorDestroy (uR) ;
}
/*
 * pbwtSubSites: build and return a new PBWT holding a subset of pOld's sites.
 *
 * A site is a candidate when the old cursor's c value at that site is below
 * thresh = M*(1-fmin).  (c is presumably the count of 0 alleles at the site -
 * confirm against PbwtCursor.)  Among candidates, `frac` is accumulated into a
 * running total and a site is kept each time the total exceeds 1.0, after
 * which 1.0 is subtracted.
 *
 *   pOld  source PBWT; its chrom, samples, missingOffset and zMissing are
 *         handed over to the result, and pOld itself is destroyed
 *   fmin  must satisfy 0 <= fmin < 1
 *   frac  must satisfy 0 < frac <= 1
 *
 * Calls die() if pOld is NULL, has no yz array, or fmin/frac are out of range.
 */
PBWT *pbwtSubSites (PBWT *pOld, double fmin, double frac)
{
  /* validate before anything touches pOld - previously pOld->M was read
     ahead of the NULL test, so the test could never report the problem */
  if (!pOld || !pOld->yz) die ("subsites without an existing pbwt") ;
  if (fmin < 0 || fmin >= 1 || frac <= 0 || frac > 1)
    die ("fmin %f, frac %f for subsites out of range\n", fmin, frac) ;

  int M = pOld->M ;
  PBWT *pNew = pbwtCreate (M, 0) ;
  int i, j, thresh = M*(1-fmin) ;
  double bit = 0.0 ;		/* running total of frac over candidate sites */
  PbwtCursor *uOld = pbwtCursorCreate (pOld, TRUE, TRUE) ;
  PbwtCursor *uNew = pbwtCursorCreate (pNew, TRUE, TRUE) ;

  uchar *x = myalloc (M, uchar) ;	/* one site's values, indexed by a[] entry */
  if (pOld->sites) pNew->sites = arrayCreate (4096, Site) ;

  for (i = 0 ; i < pOld->N ; ++i)
    { /* && short-circuits: bit only grows on sites that pass the c test */
      if ((uOld->c < thresh) && ((bit += frac) > 1.0))
	{ /* scatter through the old cursor's a[], gather through the new one's */
	  for (j = 0 ; j < M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
	  for (j = 0 ; j < M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
	  pbwtCursorWriteForwards (uNew) ;
	  if (pOld->sites)
	    array(pNew->sites, pNew->N, Site) = arr(pOld->sites, i, Site) ;
	  ++pNew->N ;
	  bit -= 1.0 ;
	}
      pbwtCursorForwardsRead (uOld) ;
    }
  pbwtCursorToAFend (uNew, pNew) ;

  fprintf (logFile, "subsites with fmin %f, frac %f leaves %d sites\n",
	   fmin, frac, pNew->N) ;

  /* move ownership to pNew; the fields are zeroed in pOld before it is destroyed */
  pNew->chrom = pOld->chrom ; pOld->chrom = 0 ;
  pNew->samples = pOld->samples ; pOld->samples = 0 ;
  pNew->missingOffset = pOld->missingOffset ; pOld->missingOffset = 0 ;
  pNew->zMissing = pOld->zMissing ; pOld->zMissing = 0 ;

  pbwtDestroy (pOld) ;
  pbwtCursorDestroy (uOld) ;
  pbwtCursorDestroy (uNew) ;
  free (x) ;
  return pNew ;
}
/*
 * Release everything held by a pbwt_reader_t: each of the reader->n PBWTs and
 * its cursor, then the reader's pbwt, cursor, unpacked and cpos arrays, and
 * finally the reader structure itself.
 */
static void pbwt_reader_destroy(pbwt_reader_t *reader)
{
    int idx = 0;

    /* per-file teardown: the PBWT first, then the cursor that was reading it */
    while (idx < reader->n)
    {
        pbwtDestroy(reader->pbwt[idx]);
        pbwtCursorDestroy(reader->cursor[idx]);
        ++idx;
    }

    /* the per-file arrays, then the container */
    free(reader->pbwt);
    free(reader->cursor);
    free(reader->unpacked);
    free(reader->cpos);
    free(reader);
}
/*
 * pbwtSubRange: build and return a new PBWT holding only the sites i of pOld
 * with start <= i < end.
 *
 *   pOld   source PBWT; its chrom, samples, missingOffset and zMissing are
 *          handed over to the result, and pOld itself is destroyed
 *   start  index of the first site to keep (must be >= 0)
 *   end    one past the last site to keep (start < end <= pOld->N)
 *
 * Calls die() if pOld is NULL, has no yz array, or the range is invalid.
 */
PBWT *pbwtSubRange (PBWT *pOld, int start, int end)
{
  /* validate before anything touches pOld - previously pOld->M was read
     ahead of the NULL test, so the test could never report the problem */
  if (!pOld || !pOld->yz) die ("subrange without an existing pbwt") ;
  if (start < 0 || end > pOld->N || end <= start)
    die ("subrange invalid start %d, end %d", start, end) ;

  int M = pOld->M ;
  PBWT *pNew = pbwtCreate (M, 0) ;
  int i, j ;
  PbwtCursor *uOld = pbwtCursorCreate (pOld, TRUE, TRUE) ;
  PbwtCursor *uNew = pbwtCursorCreate (pNew, TRUE, TRUE) ;

  uchar *x = myalloc (M, uchar) ;	/* one site's values, indexed by a[] entry */
  if (pOld->sites) pNew->sites = arrayCreate (4096, Site) ;

  /* sites before start are only stepped over so that uOld is advanced to start */
  for (i = 0 ; i < end ; ++i)
    { if (i >= start)
	{ /* scatter through the old cursor's a[], gather through the new one's */
	  for (j = 0 ; j < M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
	  for (j = 0 ; j < M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
	  pbwtCursorWriteForwards (uNew) ;
	  if (pOld->sites)
	    array(pNew->sites, pNew->N, Site) = arr(pOld->sites, i, Site) ;
	  ++pNew->N ;
	}
      pbwtCursorForwardsRead (uOld) ;
    }
  pbwtCursorToAFend (uNew, pNew) ;

  /* move ownership to pNew; the fields are zeroed in pOld before it is destroyed */
  pNew->chrom = pOld->chrom ; pOld->chrom = 0 ;
  pNew->samples = pOld->samples ; pOld->samples = 0 ;
  pNew->missingOffset = pOld->missingOffset ; pOld->missingOffset = 0 ;
  pNew->zMissing = pOld->zMissing ; pOld->zMissing = 0 ;

  pbwtDestroy (pOld) ;
  pbwtCursorDestroy (uOld) ;
  pbwtCursorDestroy (uNew) ;
  free (x) ;
  return pNew ;
}
/*
 * pbwtShapeItWithMiss: for up to 150 genotypes formed by summing consecutive
 * pairs of haplotypes in p, locate the heterozygous sites (genotype value 1),
 * group them three at a time into segments, and call extendMatch() on every
 * window of six consecutive heterozygous sites.
 *
 *   p    PBWT to work on; die() is called if it is NULL or has no yz array
 *   out  not used by this function
 *
 * Progress messages go to stderr.  The tables produced by extendMatch() are
 * destroyed immediately; nothing is returned or written to `out`.
 */
void pbwtShapeItWithMiss (PBWT *p, FILE *out)
{
  /* NOTE(review): the message names -longWithin, which looks copied from a
     different command - confirm the intended option name */
  if (!p || !p->yz) die ("option -longWithin called without a PBWT") ;

  enum { MAX_GENOTYPES    = 150,	/* upper bound on genotypes processed */
	 HETS_PER_SEGMENT = 3,		/* a new segment starts every 3 het sites */
	 HETS_PER_WINDOW  = 6,		/* het sites handed to each extendMatch call */
	 TABLES_SIZE      = 500 } ;	/* argument passed to tablesCreate */

  /******** ref *****/
  uchar **reference = pbwtHaplotypes (p) ; /* haplotypes for reference (M * N) */
  /*********************/
  PbwtCursor *up = pbwtCursorCreate (p, TRUE, TRUE) ;
  int N = p->N, M = p->M ;
  int i, j, k ;

  /* build indexes: for each site k, the cursor's c value and a copy of its u[] */
  int **u = myalloc (N, int*) ;		/* stored indexes */
  for (i = 0 ; i < N ; ++i) u[i] = myalloc (M+1, int) ;
  int *cc = myalloc (N, int) ;

  /* make pbwt index */
  for (k = 0 ; k < N ; ++k)
    { cc[k] = up->c ;
      pbwtCursorCalculateU (up) ;
      memcpy (u[k], up->u, (M+1)*sizeof(int)) ;
      pbwtCursorForwardsReadAD (up, k) ;
    }

  /* Genotype g is built from haplotype rows 2g and 2g+1, and only M rows are
     freed below, so at most M/2 genotypes exist.  The count used to be fixed
     at 150, which read past the end of reference[] whenever M < 300. */
  int nGeno = M / 2 ;
  if (nGeno > MAX_GENOTYPES) nGeno = MAX_GENOTYPES ;

  int **geno = myalloc (nGeno, int*) ;
  for (i = 0 ; i < nGeno ; ++i) geno[i] = myalloc (N, int) ;
  for (j = 0 ; j < nGeno ; ++j)
    for (i = 0 ; i < N ; ++i)
      geno[j][i] = reference[j * 2][i] + reference[j * 2 + 1][i] ;

  /* clean up what is no longer needed */
  pbwtCursorDestroy (up) ;
  for (j = 0 ; j < M ; ++j) free (reference[j]) ;
  free (reference) ;

  fprintf (stderr, "Made indices: \n") ; timeUpdate () ;

  int *pos = myalloc (N, int) ;		/* site indices of the heterozygous positions */
  int het[HETS_PER_WINDOW] ;		/* the six het sites of the current window */

  for (int t = 0 ; t < nGeno ; ++t)
    { int num_1 = 0 ;			/* number of heterozygous sites found */
      int seg_num = 1 ;			/* 1 + number of completed groups of three */
      int s, start, depth ;

      /* find the heterozygous positions and record them */
      for (i = 0, j = 0 ; i < N ; ++i)
	if (geno[t][i] == 1)
	  { ++j ;
	    pos[num_1++] = i ;
	    if (j == HETS_PER_SEGMENT) { ++seg_num ; j = 0 ; }
	  }

      fprintf (stderr, "seg_num %d \n", seg_num) ;

      /* s runs over windows of two adjacent segments; the bound seg_num - 2
	 keeps pos[s*3 + 5] inside the recorded heterozygous positions */
      for (s = 0 ; s < seg_num - 2 ; ++s)
	{ Tables *tables = 0 ;
	  uchar *seq ;

	  /* a window starts just after the last het site of the previous segment */
	  if (!s) start = 0 ;
	  else start = pos[s * HETS_PER_SEGMENT - 1] + 1 ;

	  for (i = 0 ; i < HETS_PER_WINDOW ; ++i)
	    het[i] = pos[s * HETS_PER_SEGMENT + i] ;

	  /* the window covers start .. last het site inclusive */
	  depth = het[HETS_PER_WINDOW - 1] - start + 1 ;
	  seq = myalloc (depth + 1, uchar) ;
	  memset (seq, '2', depth * sizeof(uchar)) ;
	  seq[depth] = '\0' ;

	  tables = tablesCreate (TABLES_SIZE) ;
	  extendMatch (het, start, 0, depth, seq, cc, u, 0, M, &tables) ;

	  free (seq) ;
	  tablesDestroy (tables) ;
	}
    }

  fprintf (stderr, "finished \n") ; timeUpdate () ;

  /* cleanup */
  free (pos) ;
  free (cc) ;
  for (j = 0 ; j < N ; ++j) free (u[j]) ;
  free (u) ;
  for (j = 0 ; j < nGeno ; ++j) free (geno[j]) ;
  free (geno) ;
}
PBWT *pbwtMerge(const char **fnames, int nfiles) { pbwt_reader_t *reader = pbwt_reader_init(fnames, nfiles); int nhaps = 0, i; for (i=0; i<nfiles; i++) nhaps += reader->pbwt[i]->M; PBWT *out_pbwt = pbwtCreate(nhaps, 0); PbwtCursor *cursor = pbwtNakedCursorCreate(nhaps, 0); uchar *yseq = myalloc(nhaps, uchar); out_pbwt->yz = arrayCreate (1<<20, uchar) ; out_pbwt->sites = arrayReCreate(out_pbwt->sites, reader->pbwt[0]->N, Site); out_pbwt->chrom = strdup(reader->pbwt[0]->chrom); int pos, j; while ( (pos=pbwt_reader_next(reader, nfiles)) ) { // Merge only records shared by all files for (i=0; i<nfiles; i++) { PBWT *p = reader->pbwt[i]; Site *site = arrp(p->sites, reader->cpos[i], Site); // Both position and alleles must match. This requires that the records are sorted by alleles. if ( site->x!=pos ) break; char *als = dictName(variationDict, site->varD); if ( strcmp(als,reader->mals) ) break; } if ( i!=nfiles ) { // intersection: skip records which are not present in all files for (i=0; i<nfiles; i++) { PBWT *p = reader->pbwt[i]; Site *site = arrp(p->sites, reader->cpos[i], Site); if ( site->x!=pos ) continue; char *als = dictName(variationDict, site->varD); if ( strcmp(als,reader->mals) ) continue; PbwtCursor *c = reader->cursor[i]; reader->unpacked[i] += unpack3(arrp(p->yz,reader->unpacked[i],uchar), p->M, c->y, 0); pbwtCursorForwardsA(c); } continue; } // read and merge int ihap = 0; for (i=0; i<nfiles; i++) { PbwtCursor *c = reader->cursor[i]; PBWT *p = reader->pbwt[i]; Site *site = arrp(p->sites, reader->cpos[i], Site); reader->unpacked[i] += unpack3(arrp(p->yz,reader->unpacked[i],uchar), p->M, c->y, 0); for (j=0; j<p->M; j++) yseq[ihap + c->a[j]] = c->y[j]; pbwtCursorForwardsA(c); ihap += p->M; } // pack merged haplotypes for (j=0; j<nhaps; j++) cursor->y[j] = yseq[cursor->a[j]]; pack3arrayAdd(cursor->y, out_pbwt->M, out_pbwt->yz); pbwtCursorForwardsA(cursor); // insert new site arrayExtend(out_pbwt->sites, out_pbwt->N+1); Site *site = arrayp(out_pbwt->sites, 
out_pbwt->N, Site); site->x = pos; dictAdd(variationDict, reader->mals, &site->varD); out_pbwt->N++; } pbwtCursorToAFend (cursor, out_pbwt) ; free(yseq); pbwtCursorDestroy(cursor); pbwt_reader_destroy(reader); return out_pbwt; }