/*
 * pw_ccp - write the strand cross-correlation profile of one chromosome.
 *
 * Builds the plus- and minus-strand arrays with chrarray_new(), caps both at
 * the 0.99 quantile computed from the plus-strand array, and writes one
 * "shift<TAB>correlation" row per strand shift to
 * "<output_dir>/<output_prefix>.ccp.xls".
 *
 *   p        supplies output_dir and output_prefix for the file name
 *   mapfile  passed through to chrarray_new()
 *   chr      passed through to chrarray_new()
 *   chrlen   number of elements scanned in each strand array
 *
 * The correlation window has chrlen - CCP_MARGIN elements.  The plus window
 * starts at CCP_START and the minus window at CCP_START + shift, so for the
 * shifts scanned here the minus window spans at most
 * [CCP_START + CCP_SHIFT_MIN, chrlen - CCP_SHIFT_STEP), which stays inside an
 * array of chrlen elements (assumes chrarray_new() returns chrlen elements --
 * TODO confirm).  When chrlen <= CCP_MARGIN the window would be empty or
 * negative and the start offsets could lie past the arrays, so the scan is
 * skipped: the file then contains only the header line and a warning is
 * printed to stderr.
 */
void pw_ccp(PwParam *p, Mapfile *mapfile, int chr, int chrlen){
  enum {
    CCP_START      = 1000,  /* first element of the plus-strand window        */
    CCP_MARGIN     = 2500,  /* elements excluded from the window length       */
    CCP_SHIFT_MIN  = -500,  /* first strand shift evaluated                   */
    CCP_SHIFT_END  = 1500,  /* scan stops before this shift                   */
    CCP_SHIFT_STEP = 5      /* distance between evaluated shifts              */
  };
  int i;

  printf("Making cross-correlation profile...\n");

  TYPE_WIGARRAY *plus  = chrarray_new(mapfile, chr, chrlen, STRAND_PLUS);
  TYPE_WIGARRAY *minus = chrarray_new(mapfile, chr, chrlen, STRAND_MINUS);

  /* The cap is derived from the plus strand only and applied to both. */
  TYPE_WIGARRAY qnt99 = calc_qnt(plus, chrlen, 0.99);
  for(i=0; i<chrlen; i++){
    if(plus[i]  > qnt99) plus[i]  = qnt99;
    if(minus[i] > qnt99) minus[i] = qnt99;
  }

  char *outputfile = alloc_str_new(p->output_dir, strlen(p->output_prefix) +100);
  sprintf(outputfile, "%s/%s.ccp.xls", p->output_dir, p->output_prefix);
  FILE *OUT = my_fopen(outputfile, FILE_MODE_WRITE);
  fprintf(OUT, "strand-shift\tcross-correlation\n");

  const int num = chrlen - CCP_MARGIN;
  if(num > 0){
    for(i=CCP_SHIFT_MIN; i<CCP_SHIFT_END; i+=CCP_SHIFT_STEP){
      double cc = calc_corr(plus + CCP_START, minus + CCP_START + i, num);
      fprintf(OUT, "%d\t%f\n", i, cc);
    }
  }else{
    /* Too short for a non-empty window: do not hand calc_corr() a
       non-positive length or pointers beyond the arrays. */
    fprintf(stderr, "Warning: chromosome length %d is too short (must exceed %d) for a cross-correlation profile; no rows written.\n", chrlen, (int)CCP_MARGIN);
  }

  fclose(OUT);
  printf("Output to %s.\n",outputfile);

  MYFREE(outputfile);
  MYFREE(plus);
  MYFREE(minus);
  return;
}
/* this is the main loop for the correlation type functions * fx and nx are file pointers to things like read_first_x and * read_next_x */ int corr_loop(t_corr *curr, const char *fn, t_topology *top, int ePBC, gmx_bool bMol, int gnx[], atom_id *index[], t_calc_func *calc1, gmx_bool bTen, int *gnx_com, atom_id *index_com[], real dt, real t_pdb, rvec **x_pdb, matrix box_pdb, const output_env_t oenv) { rvec *x[2]; /* the coordinates to read */ rvec *xa[2]; /* the coordinates to calculate displacements for */ rvec com = {0}; real t, t_prev = 0; int natoms, i, j, cur = 0, maxframes = 0; t_trxstatus *status; #define prev (1-cur) matrix box; gmx_bool bFirst; gmx_rmpbc_t gpbc = NULL; natoms = read_first_x(oenv, &status, fn, &curr->t0, &(x[cur]), box); #ifdef DEBUG fprintf(stderr, "Read %d atoms for first frame\n", natoms); #endif if ((gnx_com != NULL) && natoms < top->atoms.nr) { fprintf(stderr, "WARNING: The trajectory only contains part of the system (%d of %d atoms) and therefore the COM motion of only this part of the system will be removed\n", natoms, top->atoms.nr); } snew(x[prev], natoms); if (bMol) { curr->ncoords = curr->nmol; snew(xa[0], curr->ncoords); snew(xa[1], curr->ncoords); } else { curr->ncoords = natoms; xa[0] = x[0]; xa[1] = x[1]; } bFirst = TRUE; t = curr->t0; if (x_pdb) { *x_pdb = NULL; } if (bMol) { gpbc = gmx_rmpbc_init(&top->idef, ePBC, natoms); } /* the loop over all frames */ do { if (x_pdb && ((bFirst && t_pdb < t) || (!bFirst && t_pdb > t - 0.5*(t - t_prev) && t_pdb < t + 0.5*(t - t_prev)))) { if (*x_pdb == NULL) { snew(*x_pdb, natoms); } for (i = 0; i < natoms; i++) { copy_rvec(x[cur][i], (*x_pdb)[i]); } copy_mat(box, box_pdb); } /* check whether we've reached a restart point */ if (bRmod(t, curr->t0, dt)) { curr->nrestart++; srenew(curr->x0, curr->nrestart); snew(curr->x0[curr->nrestart-1], curr->ncoords); srenew(curr->com, curr->nrestart); srenew(curr->n_offs, curr->nrestart); srenew(curr->lsq, curr->nrestart); 
snew(curr->lsq[curr->nrestart-1], curr->nmol); for (i = 0; i < curr->nmol; i++) { curr->lsq[curr->nrestart-1][i] = gmx_stats_init(); } if (debug) { fprintf(debug, "Extended data structures because of new restart %d\n", curr->nrestart); } } /* create or extend the frame-based arrays */ if (curr->nframes >= maxframes-1) { if (maxframes == 0) { for (i = 0; (i < curr->ngrp); i++) { curr->ndata[i] = NULL; curr->data[i] = NULL; if (bTen) { curr->datam[i] = NULL; } } curr->time = NULL; } maxframes += 10; for (i = 0; (i < curr->ngrp); i++) { srenew(curr->ndata[i], maxframes); srenew(curr->data[i], maxframes); if (bTen) { srenew(curr->datam[i], maxframes); } for (j = maxframes-10; j < maxframes; j++) { curr->ndata[i][j] = 0; curr->data[i][j] = 0; if (bTen) { clear_mat(curr->datam[i][j]); } } } srenew(curr->time, maxframes); } /* set the time */ curr->time[curr->nframes] = t - curr->t0; /* for the first frame, the previous frame is a copy of the first frame */ if (bFirst) { std::memcpy(xa[prev], xa[cur], curr->ncoords*sizeof(xa[prev][0])); bFirst = FALSE; } /* make the molecules whole */ if (bMol) { gmx_rmpbc(gpbc, natoms, box, x[cur]); } /* calculate the molecules' centers of masses and put them into xa */ if (bMol) { calc_mol_com(gnx[0], index[0], &top->mols, &top->atoms, x[cur], xa[cur]); } /* first remove the periodic boundary condition crossings */ for (i = 0; i < curr->ngrp; i++) { prep_data(bMol, gnx[i], index[i], xa[cur], xa[prev], box); } /* calculate the center of mass */ if (gnx_com) { prep_data(bMol, gnx_com[0], index_com[0], xa[cur], xa[prev], box); calc_com(bMol, gnx_com[0], index_com[0], xa[cur], xa[prev], box, &top->atoms, com); } /* loop over all groups in index file */ for (i = 0; (i < curr->ngrp); i++) { /* calculate something useful, like mean square displacements */ calc_corr(curr, i, gnx[i], index[i], xa[cur], (gnx_com != NULL), com, calc1, bTen); } cur = prev; t_prev = t; curr->nframes++; } while (read_next_x(oenv, status, &t, x[cur], box)); 
fprintf(stderr, "\nUsed %d restart points spaced %g %s over %g %s\n\n", curr->nrestart, output_env_conv_time(oenv, dt), output_env_get_time_unit(oenv), output_env_conv_time(oenv, curr->time[curr->nframes-1]), output_env_get_time_unit(oenv) ); if (bMol) { gmx_rmpbc_done(gpbc); } close_trj(status); return natoms; }
/*
 * NEON implementation of the fixed-point warped autocorrelation.
 *
 * Falls back to silk_warped_autocorrelation_FIX_c() when
 * MAX_SHAPE_LPC_ORDER > 24 or order < 6.  Otherwise:
 *   1. input is widened to 32 bits, shifted left by QS and stored in a
 *      scratch buffer with zero padding in front and behind;
 *   2. the taps are processed 8 at a time, plus one 4-tap pass when 4 taps
 *      remain, accumulating into corr_QC[] through calc_corr() and
 *      calc_state() (both defined elsewhere);
 *   3. the sum of squared input samples, shifted left by QC, is stored in
 *      corr_QC[ orderT ];
 *   4. the order + 1 accumulators are shifted by a common lsh, narrowed to
 *      32 bits and written to corr[] in reversed order, so that
 *      corr[ order - i ] receives corr_QCT[ i ]; *scale is set to -( QC + lsh ).
 *
 * With OPUS_CHECK_ASM defined, the result is also recomputed with the C
 * version and asserted to be identical.
 */
void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
    const opus_int                  length,                                 /* I    Length of input                                                             */
    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int       n, i, lsh;
        opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64     corr_QC_orderT;
        int64x2_t      lsh_s64x2;
        const opus_int orderT = ( order + 3 ) & ~3; /* order rounded up to a multiple of 4 */
        opus_int64     *corr_QCT;
        opus_int32     *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        /* 4 elements of slack: the 4-lane load vld1q_s32( in + n ) in the first 8-tap pass   */
        /* reaches index MAX_SHAPE_LPC_ORDER + length + order + 2, which lies beyond          */
        /* length + 2 * MAX_SHAPE_LPC_ORDER elements when order >= MAX_SHAPE_LPC_ORDER - 2.   */
        /* Only lane 0 of that load is used, so the slack does not need to be initialized.    */
        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER + 4, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
        /* 6 stores of 4 lanes each: 24 leading zeros.                                        */
        /* NOTE(review): the pointer reset further down assumes the leading padding is        */
        /* MAX_SHAPE_LPC_ORDER elements, i.e. MAX_SHAPE_LPC_ORDER == 24 -- confirm.           */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples */
        /* 8 samples per iteration: widen to 32 bits and shift left by QS. */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x4 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
        }
        /* Scalar tail for the remaining ( length % 8 ) samples. */
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        /* 6 stores of 4 lanes each: 24 trailing zeros. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        /* Rewind so that input_QS + orderT is input_QST + MAX_SHAPE_LPC_ORDER. */
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues.       */
        /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            const opus_int32 *in = input_QS + orderT; /* first pass reads the padded input */
            opus_int o = orderT;                      /* taps still to be processed        */
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            /* 4 elements of slack: vld1q_s32( in + n ) with in == state and vld1q_s32( stateT )  */
            /* load 4 lanes starting at indices up to length + order - 1, i.e. they touch index   */
            /* length + order + 2, while orderT - order is only 0 or 2.  Only lane 0 is consumed  */
            /* (the following vextq_s32( ..., 1 ) drops the other lanes), so results do not       */
            /* depend on the slack contents.                                                      */
            ALLOC( state, length + orderT + 4, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    /* Read the value feeding this group of taps before state[ n ] is     */
                    /* overwritten below (in and state are the same buffer after pass 1). */
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    /* Lane 0 is saved as the input of the next group of taps. */
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                in = state;
                o -= 8;
            } while( o > 4 );

            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

        /* Sum of squared input samples, accumulated in 64 bits; ends up in corr[ 0 ]. */
        {
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            /* Scalar tail for the remaining ( length % 8 ) samples. */
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        /* Skip the orderT - order accumulators that exist only because of the rounding. */
        corr_QCT = corr_QC + orderT - order;
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
        /* 4 accumulators per iteration: shift by lsh (vshlq_s64 shifts right for a negative */
        /* count), narrow to 32 bits and store reversed at corr[ order - i - 3 .. order - i ]. */
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4     = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        /* Scalar tail for the accumulators the vector loop did not cover, up to index order. */
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    /* Cross-check against the C implementation: both outputs must match exactly. */
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int   scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}