Example #1
/**
 * \brief Chops the data into stationary segments based on Bayesian change point analysis
 *
 * This function splits the data into two segments (and recursively runs on each of those segments) if the odds
 * ratio for them being drawn from two independent Gaussian distributions exceeds a given threshold.
 *
 * The threshold for the natural logarithm of the odds ratio is empirically set to be
 * \f[
 * T = 4.07 + 1.33\log_{10} N,
 * \f]
 * where \f$N\f$ is the length in samples of the dataset (for example, a 1024-sample dataset gives
 * \f$T \approx 8.1\f$). This is based on Monte Carlo simulations of many realisations of Gaussian noise
 * for data of different lengths. The threshold comes from a linear fit to the log odds ratios required
 * to give a 1% chance of splitting Gaussian data (drawn from a single distribution) for data of various
 * lengths. Note, however, that this relation does not hold well for stretches of data shorter than about
 * 30 points, and is in fact rather conservative for such short stretches, i.e. they require relatively
 * larger odds ratios for splitting than longer stretches do.
 *
 * \param data [in] A complex data vector
 * \param chunkMin [in] The minimum allowed segment length
 *
 * \return A vector of the cumulative indices at which each stationary segment ends
 *
 * \sa find_change_point
 */
UINT4Vector *chop_data( gsl_vector_complex *data, UINT4 chunkMin ){
  UINT4Vector *chunkIndex = NULL;

  UINT4 length = (UINT4)data->size;

  REAL8 logodds = 0.;
  UINT4 changepoint = 0;

  REAL8 threshold = 0.; /* may need tuning or setting globally */

  chunkIndex = XLALCreateUINT4Vector( 1 );

  changepoint = find_change_point( data, &logodds, chunkMin );

  /* empirically determined threshold for the log odds ratio (see the documentation above) */
  threshold = 4.07 + 1.33*log10((REAL8)length);

  if ( logodds > threshold ){
    UINT4Vector *cp1 = NULL;
    UINT4Vector *cp2 = NULL;

    gsl_vector_complex_view data1 = gsl_vector_complex_subvector( data, 0, changepoint );
    gsl_vector_complex_view data2 = gsl_vector_complex_subvector( data, changepoint, length-changepoint );

    UINT4 i = 0, l = 0;

    cp1 = chop_data( &data1.vector, chunkMin );
    cp2 = chop_data( &data2.vector, chunkMin );

    l = cp1->length + cp2->length;

    chunkIndex = XLALResizeUINT4Vector( chunkIndex, l );

    /* combine new chunks */
    for (i = 0; i < cp1->length; i++) { chunkIndex->data[i] = cp1->data[i]; }
    for (i = 0; i < cp2->length; i++) { chunkIndex->data[i+cp1->length] = cp2->data[i] + changepoint; }

    XLALDestroyUINT4Vector( cp1 );
    XLALDestroyUINT4Vector( cp2 );
  }
  else{ chunkIndex->data[0] = length; }

  return chunkIndex;
}
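
Below is a minimal, hypothetical usage sketch (not part of the original listing): it assumes a pre-filled gsl_vector_complex and an arbitrary minimum segment length of 5 samples, and simply prints the cumulative end index of each segment returned by chop_data().

/* Hypothetical usage sketch: `d` is assumed to be an already-filled complex data vector, and the
 * headers used by the surrounding file (LAL vector factories, GSL complex vectors, stdio) are
 * assumed to be available. */
void example_chop_data_usage( gsl_vector_complex *d ){
  UINT4 i = 0;
  UINT4Vector *segends = chop_data( d, 5 ); /* chunkMin = 5 is an arbitrary choice */

  /* each entry is the cumulative index at which a stationary segment ends */
  for ( i = 0; i < segends->length; i++ ){
    fprintf(stdout, "segment %u ends at sample %u\n", i, segends->data[i]);
  }

  XLALDestroyUINT4Vector( segends );
}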
Example #2
/**
 * \brief Chops and remerges data into stationary segments
 *
 * This function finds segments of data that appear to be stationary (have the same standard deviation).
 *
 * The function first attempts to chop up the data into as many stationary segments as possible. The splitting may not
 * be optimal, so it then tries remerging consecutive segments to see if the merged segments show more evidence of
 * stationarity. <b>[NOTE: Remerging is currently turned off and will make very little difference to the algorithm]</b>.
 * It then, if necessary, chops the segments again to make sure there are none greater than the required \c chunkMax.
 * The default \c chunkMax is 0, so this rechopping will not normally happen.
 *
 * This is all performed on data from which a running median has been subtracted, to remove any underlying trends
 * (e.g. those caused by a strong signal) that might affect the calculations, which assume the data is Gaussian
 * with zero mean.
 *
 * If the \c verbose flag is set then a list of the segments will be output to a file called \c data_segment_list.txt,
 * with a prefix of the detector name.
 *
 * \param data [in] A data structure
 * \param chunkMin [in] The minimum length of a segment
 * \param chunkMax [in] The maximum length of a segment
 *
 * \return A vector of segment/chunk lengths
 *
 * \sa subtract_running_median
 * \sa chop_data
 * \sa merge_data
 * \sa rechop_data
 */
UINT4Vector *chop_n_merge( LALInferenceIFOData *data, INT4 chunkMin, INT4 chunkMax ){
  UINT4 j = 0;

  UINT4Vector *chunkLengths = NULL;
  UINT4Vector *chunkIndex = NULL;

  COMPLEX16Vector *meddata = NULL;

  /* subtract a running median from the data to remove any underlying trends (e.g. caused by a strong signal) that
   * might affect the chunk calculations (which assume the data is Gaussian with zero mean). */
  meddata = subtract_running_median( data->compTimeData->data );

  /* pass chop_data() a gsl_vector_complex_view of the data, so that internally it can use vector views rather than having to create new vectors */
  gsl_vector_complex_view meddatagsl = gsl_vector_complex_view_array((double*)meddata->data, meddata->length);
  chunkIndex = chop_data( &meddatagsl.vector, chunkMin );

  /* DON'T BOTHER WITH THE MERGING AS IT WILL MAKE VERY LITTLE DIFFERENCE */
  /* merge_data( meddata, chunkIndex ); */

  /* if a maximum chunk length is defined then re-chop the data, splitting any chunks longer than this value */
  if ( chunkMax > chunkMin ) { rechop_data( chunkIndex, chunkMax, chunkMin ); }

  chunkLengths = XLALCreateUINT4Vector( chunkIndex->length );

  /* go through segments and turn into vector of chunk lengths */
  for ( j = 0; j < chunkIndex->length; j++ ){
    if ( j == 0 ) { chunkLengths->data[j] = chunkIndex->data[j]; }
    else { chunkLengths->data[j] = chunkIndex->data[j] - chunkIndex->data[j-1]; }
  }

  /* if verbose print out the segment end indices to a file */
  if ( verbose_output ){
    FILE *fpsegs = NULL;

    CHAR *outfile = NULL;

    /* set detector name as prefix */
    outfile = XLALStringDuplicate( data->detector->frDetector.prefix );

    outfile = XLALStringAppend( outfile, "data_segment_list.txt" );

    if ( (fpsegs = fopen(outfile, "w")) == NULL ){
      fprintf(stderr, "Non-fatal error: could not open file to output the segment list.\n");
      XLALFree( outfile );
      return chunkLengths;
    }

    for ( j = 0; j < chunkIndex->length; j++ ) { fprintf(fpsegs, "%u\n", chunkIndex->data[j]); }

    /* add space at the end so that you can separate lists from different detector data streams */
    fprintf(fpsegs, "\n");

    fclose( fpsegs );
    XLALFree( outfile );
  }

  return chunkLengths;
}
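
As a similarly hypothetical sketch (assuming an already-populated LALInferenceIFOData structure called ifo), the chunk lengths returned by chop_n_merge() can be accumulated to recover the segment end indices that the verbose output file contains.

/* Hypothetical usage sketch: `ifo` is assumed to be an already-populated LALInferenceIFOData
 * structure. With chunkMax left at 0 (the documented default) the re-chopping step is skipped. */
void example_chop_n_merge_usage( LALInferenceIFOData *ifo ){
  UINT4 j = 0, cumend = 0;
  UINT4Vector *chunkLengths = chop_n_merge( ifo, 5, 0 ); /* chunkMin = 5 is an arbitrary choice */

  for ( j = 0; j < chunkLengths->length; j++ ){
    cumend += chunkLengths->data[j]; /* running sum recovers the segment end indices */
    fprintf(stdout, "chunk %u: length %u, ends at sample %u\n", j, chunkLengths->data[j], cumend);
  }

  XLALDestroyUINT4Vector( chunkLengths );
}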