/**
 * \brief Chops the data into stationary segments based on Bayesian change point analysis
 *
 * This function splits the data into two segments (and then runs recursively on each of those segments) if the odds
 * ratio for the segments being drawn from two independent Gaussian distributions is greater than a certain threshold.
 *
 * The threshold for the natural logarithm of the odds ratio is empirically set to be
 * \f[
 * T = 4.07 + 1.33\log_{10}{N},
 * \f]
 * where \f$N\f$ is the length in samples of the dataset. This is based on Monte Carlo simulations of
 * many realisations of Gaussian noise for data of different lengths. The threshold comes from a linear
 * fit to the log odds ratios required to give a 1% chance of splitting Gaussian data (drawn from a single
 * distribution) for data of various lengths. Note, however, that this relation does not hold well for stretches of
 * data shorter than about 30 points, and is in fact rather conservative for such short stretches, i.e. short
 * stretches of data will require relatively larger odds ratios for splitting than longer stretches.
 *
 * \param data [in] A complex data vector
 * \param chunkMin [in] The minimum allowed segment length
 *
 * \return A vector of cumulative segment end indices (consecutive differences give the segment lengths)
 *
 * \sa find_change_point
 */
UINT4Vector *chop_data( gsl_vector_complex *data, UINT4 chunkMin ){
  UINT4Vector *chunkIndex = NULL;

  UINT4 length = (UINT4)data->size;

  REAL8 logodds = 0.;
  UINT4 changepoint = 0;

  REAL8 threshold = 0.; /* may need tuning or setting globally */

  chunkIndex = XLALCreateUINT4Vector( 1 );

  changepoint = find_change_point( data, &logodds, chunkMin );

  /* threshold scaling to give a 1% chance of splitting Gaussian data */
  threshold = 4.07 + 1.33*log10((REAL8)length);

  if ( logodds > threshold ){
    UINT4Vector *cp1 = NULL;
    UINT4Vector *cp2 = NULL;

    gsl_vector_complex_view data1 = gsl_vector_complex_subvector( data, 0, changepoint );
    gsl_vector_complex_view data2 = gsl_vector_complex_subvector( data, changepoint, length-changepoint );

    UINT4 i = 0, l = 0;

    /* recursively apply the change point analysis to each half */
    cp1 = chop_data( &data1.vector, chunkMin );
    cp2 = chop_data( &data2.vector, chunkMin );

    l = cp1->length + cp2->length;

    chunkIndex = XLALResizeUINT4Vector( chunkIndex, l );

    /* combine the new chunks, offsetting the second set of end indices by the change point */
    for ( i = 0; i < cp1->length; i++ ) { chunkIndex->data[i] = cp1->data[i]; }
    for ( i = 0; i < cp2->length; i++ ) { chunkIndex->data[i+cp1->length] = cp2->data[i] + changepoint; }

    XLALDestroyUINT4Vector( cp1 );
    XLALDestroyUINT4Vector( cp2 );
  }
  else{ chunkIndex->data[0] = length; }

  return chunkIndex;
}
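/* A minimal usage sketch of chop_data() (not part of the original file). It assumes the surrounding file already
 * declares find_change_point() and includes <gsl/gsl_randist.h> along with the GSL complex vector and LAL vector
 * headers; the function name example_chop_data is hypothetical and for illustration only. The data are Gaussian
 * noise whose standard deviation doubles halfway through, so chop_data() should place a change point near the
 * middle: for N = 1000 the splitting threshold is T = 4.07 + 1.33*log10(1000) = 8.06. Note that the returned vector
 * holds cumulative segment end indices, so consecutive differences give the segment lengths. */
static void example_chop_data( void ){
  const UINT4 N = 1000;
  UINT4 i = 0;

  gsl_rng *rng = gsl_rng_alloc( gsl_rng_mt19937 );
  gsl_vector_complex *v = gsl_vector_complex_alloc( N );

  /* first half: sigma = 1; second half: sigma = 2 */
  for ( i = 0; i < N; i++ ){
    double sigma = ( i < N/2 ) ? 1. : 2.;
    gsl_complex z;
    GSL_SET_COMPLEX( &z, gsl_ran_gaussian( rng, sigma ), gsl_ran_gaussian( rng, sigma ) );
    gsl_vector_complex_set( v, i, z );
  }

  /* segment the data with a minimum chunk length of 5 samples */
  UINT4Vector *ends = chop_data( v, 5 );

  /* convert the cumulative end indices into lengths, as chop_n_merge() does below */
  for ( i = 0; i < ends->length; i++ ){
    UINT4 start = ( i == 0 ) ? 0 : ends->data[i-1];
    fprintf(stderr, "segment %u: length %u\n", i, ends->data[i] - start);
  }

  XLALDestroyUINT4Vector( ends );
  gsl_vector_complex_free( v );
  gsl_rng_free( rng );
}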
/**
 * \brief Chops and remerges data into stationary segments
 *
 * This function finds segments of data that appear to be stationary (have the same standard deviation).
 *
 * The function first attempts to chop up the data into as many stationary segments as possible. The splitting may not
 * be optimal, so it then tries remerging consecutive segments to see if the merged segments show more evidence of
 * stationarity. <b>[NOTE: Remerging is currently turned off and will make very little difference to the algorithm]</b>.
 * It then, if necessary, chops the segments again to make sure there are none greater than the required \c chunkMax.
 * The default \c chunkMax is 0, so this rechopping will not normally happen.
 *
 * This is all performed on data that has had a running median subtracted, to try and remove any underlying trends in
 * the data (e.g. those caused by a strong signal), which might affect the calculations (which assume the data is
 * Gaussian with zero mean).
 *
 * If the \c verbose flag is set then a list of the segments will be output to a file called \c data_segment_list.txt,
 * prefixed with the detector name.
 *
 * \param data [in] A data structure
 * \param chunkMin [in] The minimum length of a segment
 * \param chunkMax [in] The maximum length of a segment
 *
 * \return A vector of segment/chunk lengths
 *
 * \sa subtract_running_median
 * \sa chop_data
 * \sa merge_data
 * \sa rechop_data
 */
UINT4Vector *chop_n_merge( LALInferenceIFOData *data, INT4 chunkMin, INT4 chunkMax ){
  UINT4 j = 0;

  UINT4Vector *chunkLengths = NULL;
  UINT4Vector *chunkIndex = NULL;

  COMPLEX16Vector *meddata = NULL;

  /* subtract a running median value from the data to remove any underlying trends (e.g. caused by a strong signal)
   * that might affect the chunk calculations (which assume the data is Gaussian with zero mean). */
  meddata = subtract_running_median( data->compTimeData->data );

  /* pass chop_data a gsl_vector_complex view, so that internally it can use vector views rather than having to
   * create new vectors */
  gsl_vector_complex_view meddatagsl = gsl_vector_complex_view_array((double*)meddata->data, meddata->length);
  chunkIndex = chop_data( &meddatagsl.vector, chunkMin );

  /* DON'T BOTHER WITH THE MERGING AS IT WILL MAKE VERY LITTLE DIFFERENCE */
  /* merge_data( meddata, chunkIndex ); */

  /* if a maximum chunk length is defined then rechop up the data, to segment any chunks longer than this value */
  if ( chunkMax > chunkMin ) { rechop_data( chunkIndex, chunkMax, chunkMin ); }

  chunkLengths = XLALCreateUINT4Vector( chunkIndex->length );

  /* go through the segment end indices and turn them into a vector of chunk lengths */
  for ( j = 0; j < chunkIndex->length; j++ ){
    if ( j == 0 ) { chunkLengths->data[j] = chunkIndex->data[j]; }
    else { chunkLengths->data[j] = chunkIndex->data[j] - chunkIndex->data[j-1]; }
  }

  /* if verbose print out the segment end indices to a file */
  if ( verbose_output ){
    FILE *fpsegs = NULL;

    CHAR *outfile = NULL;

    /* set the detector name as the file name prefix */
    outfile = XLALStringDuplicate( data->detector->frDetector.prefix );
    outfile = XLALStringAppend( outfile, "data_segment_list.txt" );

    if ( (fpsegs = fopen(outfile, "w")) == NULL ){
      fprintf(stderr, "Non-fatal error: could not open file to output the segment list.\n");
      XLALFree( outfile );
      XLALDestroyCOMPLEX16Vector( meddata );
      XLALDestroyUINT4Vector( chunkIndex );
      return chunkLengths;
    }

    for ( j = 0; j < chunkIndex->length; j++ ) { fprintf(fpsegs, "%u\n", chunkIndex->data[j]); }

    /* add an empty line at the end so that lists from different detector data streams can be separated */
    fprintf(fpsegs, "\n");

    fclose( fpsegs );
    XLALFree( outfile );
  }

  /* free the working copies of the data and the end indices */
  XLALDestroyCOMPLEX16Vector( meddata );
  XLALDestroyUINT4Vector( chunkIndex );

  return chunkLengths;
}
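/* A brief usage sketch of chop_n_merge() (not part of the original file). It assumes an initialised
 * LALInferenceIFOData structure with its compTimeData time series filled; the function name example_chop_n_merge is
 * hypothetical and for illustration only. Because the returned chunk lengths partition the data, they should sum to
 * the full data length, which this sketch checks. */
static void example_chop_n_merge( LALInferenceIFOData *ifo ){
  UINT4 i = 0, total = 0;

  /* segment the data with a minimum chunk of 5 samples and the default chunkMax of 0 (i.e. no rechopping) */
  UINT4Vector *chunkLengths = chop_n_merge( ifo, 5, 0 );

  for ( i = 0; i < chunkLengths->length; i++ ){ total += chunkLengths->data[i]; }

  /* the sum of the chunk lengths should equal the number of data samples */
  fprintf(stderr, "%u chunks covering %u of %u samples\n", chunkLengths->length, total,
          (UINT4)ifo->compTimeData->data->length);

  XLALDestroyUINT4Vector( chunkLengths );
}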