예제 #1
0
int main(){
    apop_data *d = apop_text_alloc(apop_data_alloc(6), 6, 1);
    apop_data_fill(d,   1,   2,   3,   3,   1,   2);
    apop_text_fill(d,  "A", "A", "A", "A", "A", "B");

    asprintf(&d->names->title, "Original data set");
    printdata(d);

        //binned, where bin ends are equidistant but not necessarily in the data
    apop_data *binned = apop_data_to_bins(d, NULL);
    asprintf(&binned->names->title, "Post binning");
    printdata(binned);
    assert(apop_sum(binned->weights)==6);
    assert(fabs(//equal distance between bins
              (apop_data_get(binned, 1, -1) - apop_data_get(binned, 0, -1))
            - (apop_data_get(binned, 2, -1) - apop_data_get(binned, 1, -1))) < 1e-5);

        //compressed, where the data is as in the original, but weights 
        //are redome to accommodate repeated observations.
    apop_data_pmf_compress(d);
    asprintf(&d->names->title, "Post compression");
    printdata(d);
    assert(apop_sum(d->weights)==6);

    apop_model *d_as_pmf = apop_estimate(d, apop_pmf);
    Apop_row(d, 0, firstrow); //1A
    assert(fabs(apop_p(firstrow, d_as_pmf) - 2./6 < 1e-5));
}
예제 #2
0
apop_data *get_factors(SEXP ls, char const *varname){
    int nls = LENGTH(ls);
    if (isNull(ls)) return NULL;
    //else:
    apop_data *out = apop_text_alloc(NULL, nls, 1);
    for (int i = 0; i < nls; i++) 
        apop_text_add(out, i, 0, translateChar(STRING_ELT(ls, i)));
    asprintf(&out->names->title, "<categories for %s>", varname);
    apop_data_show(out);
    return out;
}
예제 #3
0
파일: factors.c 프로젝트: b-k/apophenia
int main(){
    apop_data *d1 = apop_text_alloc(NULL, 5, 1);
    apop_data *d2 = apop_text_alloc(NULL, 5, 1);
    apop_text_fill(d1, "A", "B", "C", "B", "B");
    apop_text_fill(d2, "B", "A", "D", "B", "B");
    apop_data_to_factors(d1);
    apop_data_show(d1);
    d2->more = apop_data_copy(apop_data_get_factor_names(d1, 0, 't'));
    printf("-----\n");
    apop_data_to_dummies(d2, .append='y');
    apop_data_show(d2);


    //some spot checks.
    assert(apop_data_get(d1, 2)==2);
    assert(apop_data_get(d2, 0, 0)==1);
    assert(apop_data_get(d2, 2, 0)==0);
    assert(apop_data_get(d2, 2, 1)==0);
    assert(apop_data_get(d2, 2, 2)==1);
    assert(apop_data_get(d2, 3, 0)==1);
}
예제 #4
0
/** A convenience function for regular expression searching 

\li There are three common flavors of regular expression: Basic, Extended,
and Perl-compatible (BRE, ERE, PCRE). I use EREs, as per the specs of
your C library, which should match POSIX's ERE specification. 

For example, "p.val" will match "P value", "p.value", "p values" (and even "tempeval", so be
careful).

If you give a non-\c NULL address in which to place a table of paren-delimited substrings, I'll return them as a row in the text element of the returned \ref apop_data set. I'll return <em>all</em> the matches, filling the first row with substrings from the first application of your regex, then filling the next row with another set of matches (if any), and so on to the end of the string. Useful when parsing a list of items, for example.


\param string        The string to search (no default)
\param regex       The regular expression (no default)
\param substrings   Parens in the regex indicate that I should return matching substrings. Give me the _address_ of an \ref apop_data* set, and I will allocate and fill the text portion with matches. Default= \c NULL, meaning do not return substrings (even if parens exist in the regex). If no match, return an empty \ref apop_data set, so <tt>output->textsize[0]==0</tt>.
\param use_case         Should I be case sensitive, \c 'y' or \c 'n'? (default = \c 'n', which is not the POSIX default.)

\return         Count of matches found. 0 == no match. \c substrings may be allocated and filled if needed.
\ingroup names


\li If <tt>apop_opts.stop_on_warning='n'</tt> returns -1 on error (e.g., regex \c NULL or didn't compile).
\li If <tt>strings==NULL</tt>, I return 0---no match---and if \c substrings is provided, set it to \c NULL.

\li Here is the test function. Notice that the substring-pulling
function call passes \c &subs, not plain \c subs. Also, the non-match
has a zero-length blank in <tt>subs->text[0][1]</tt>.

\include test_regex.c

\li Each set of matches will be one row of the output data. E.g., given the regex <tt>([A-Za-z])([0-9])</tt>, the column zero of <tt>outdata</tt> will hold letters, and column one will hold numbers.
Use \ref apop_data_transpose to reverse this so that the letters are in <tt>outdata->text[0]</tt> and numbers in <tt>outdata->text[1]</tt>.
*/
APOP_VAR_HEAD int  apop_regex(const char *string, const char* regex, apop_data **substrings, const char use_case){
    const char * apop_varad_var(string, NULL);
    apop_data **apop_varad_var(substrings, NULL);
    if (!string) {
        if (substrings) *substrings=NULL;
        return 0;
    }
    const char * apop_varad_var(regex, NULL);
    Apop_assert_negone(regex, "You gave me a NULL regex.");
    const char apop_varad_var(use_case, 'n');
APOP_VAR_ENDHEAD
    regex_t re;
    int matchcount=count_parens(regex);
    int found, found_ct=0;
    regmatch_t result[matchcount];
    int compiled_ok = !regcomp(&re, regex, REG_EXTENDED 
                                            + (use_case=='y' ? 0 : REG_ICASE)
                                            + (substrings ? 0 : REG_NOSUB) );
    Apop_stopif(!compiled_ok, return -1, 0, "This regular expression didn't compile: \"%s\"", regex)

    int matchrow = 0;
    if (substrings) *substrings = apop_data_alloc();
    do {
        found_ct+=
        found    = !regexec(&re, string, matchcount+1, result, matchrow ? REG_NOTBOL : 0);
        if (substrings && found){
            *substrings = apop_text_alloc(*substrings, matchrow+1, matchcount);
            //match zero is the whole string; ignore.
            for (int i=0; i< matchcount; i++){
                if (result[i+1].rm_eo > 0){//GNU peculiarity: match-to-empty marked with -1.
                    int length_of_match = result[i+1].rm_eo - result[i+1].rm_so;
                    free((*substrings)->text[matchrow][i]);
                    (*substrings)->text[matchrow][i] = malloc(strlen(string)+1);
                    memcpy((*substrings)->text[matchrow][i], string + result[i+1].rm_so, length_of_match);
                    (*substrings)->text[matchrow][i][length_of_match] = '\0';
                } //else matches nothing; apop_text_alloc already made this cell this NULL.
            }
            string += result[0].rm_eo; //end of whole match;
            matchrow++;
        }
    } while (substrings && found && string[0]!='\0');
    regfree(&re);
    return found_ct;
}
예제 #5
0
apop_data *apop_data_from_frame(SEXP in){
    apop_data *out;
    if (TYPEOF(in)==NILSXP) return NULL;

    PROTECT(in);
    assert(TYPEOF(in)==VECSXP); //I should write a check for this on the R side.
    int total_cols=LENGTH(in);
    int total_rows=LENGTH(VECTOR_ELT(in,0));
    int char_cols = 0;
    for (int i=0; i< total_cols; i++){
        SEXP this_col = VECTOR_ELT(in, i);
        char_cols += (TYPEOF(this_col)==STRSXP);
    }
    SEXP rl, cl;
    //const char *rn, *cn;
    //GetMatrixDimnames(in, &rl, &cl, &rn, &cn);
    PROTECT(cl = getAttrib(in, R_NamesSymbol));
    PROTECT(rl = getAttrib(in, R_RowNamesSymbol));

    int current_numeric_col=0, current_text_col=0, found_vector=0;

    if(cl !=R_NilValue && TYPEOF(cl)==STRSXP) //just check for now.
        for (int ndx=0; ndx < LENGTH(cl) && !found_vector; ndx++)
            if (!strcmp(translateChar(STRING_ELT(cl, ndx)), "Vector")) found_vector++;

    int matrix_cols= total_cols-char_cols-found_vector;
    out= apop_data_alloc((found_vector?total_rows:0), (matrix_cols?total_rows:0),  matrix_cols);
    if (char_cols) out=apop_text_alloc(out, total_rows, char_cols);

    if(rl !=R_NilValue)
        for (int ndx=0; ndx < LENGTH(rl); ndx++)
            if (TYPEOF(rl)==STRSXP)
                apop_name_add(out->names, translateChar(STRING_ELT(rl, ndx)), 'r');
            else //let us guess that it's a numeric list and hope the R Project one day documents this stuff.
                {char *ss; asprintf(&ss, "%i", ndx); apop_name_add(out->names, ss, 'r'); free(ss);}

    for (int i=0; i< total_cols; i++){
        const char *colname = NULL;
        if(cl !=R_NilValue)
            colname = translateChar(STRING_ELT(cl, i));
        SEXP this_col = VECTOR_ELT(in, i);
        if (TYPEOF(this_col) == STRSXP){
            //could this be via aliases instead of copying?
            //printf("col %i is chars\n", i);
            if(colname) apop_name_add(out->names, colname, 't');
            for (int j=0; j< total_rows; j++)
                apop_text_add(out, j, current_text_col,
				(STRING_ELT(this_col,j)==NA_STRING ? apop_opts.nan_string : translateChar(STRING_ELT(this_col, j))));
            current_text_col++;
            continue;
        } else {    //plain old matrix data.
            int col_in_question = current_numeric_col;
            if (colname && !strcmp(colname, "Vector")) {
                out->vector = gsl_vector_alloc(total_rows);
                col_in_question = -1;
            } else {current_numeric_col++;}
            Apop_col_v(out, col_in_question, onecol);
            if (TYPEOF(this_col) == INTSXP){
                //printf("col %i is ints\n", i);
                int *vals = INTEGER(this_col);
                for (int j=0; j< onecol->size; j++){
					//printf("%i\n",vals[j]);
                    gsl_vector_set(onecol, j, (vals[j]==NA_INTEGER ? GSL_NAN : vals[j]));
					}
            } else {
                double *vals = REAL(this_col);
                for (int j=0; j< onecol->size; j++)
                    gsl_vector_set(onecol, j, (ISNAN(vals[j])||ISNA(vals[j]) ? GSL_NAN : vals[j]));
            }
            if(colname && col_in_question > -1) apop_name_add(out->names, colname, 'c');
            else apop_name_add(out->names, colname, 'v'); //which is "vector".
        }
        //Factors
        SEXP ls = getAttrib(this_col, R_LevelsSymbol);
        if (ls){
            apop_data *end;//find last page for adding factors.
            for(end=out; end->more!=NULL; end=end->more);
            end->more = get_factors(ls, colname);
        }
    }
    UNPROTECT(3);
    return out;
}
예제 #6
0
/* This function prettyprints the \c apop_data set to a screen.

It is currently not in the documentation. It'd be nice to merge this w/apop_data_print.

This takes a lot of machinery. I write every last element to a text array, then measure column widths, then print to screen with padding to guarantee that everything lines up.  There's no way to have the first element of a column line up with the last unless you interrogate the width of every element in the column, so printing columns really can't be a one-pass process.

So, I produce an \ref apop_data set with no numeric elements and a text element to be
filled with the input data set, and then print that. That means that I'll be using
(more than) twice the memory to print this. If this is a problem, you can use \ref
apop_data_print to dump your data to a text file, and view the text file, or print
subsets.

For more machine-readable printing, see \ref apop_data_print.
*/
void apop_data_show(const apop_data *in){
    if (!in) {printf("NULL\n"); return;}
    Get_vmsizes(in) //vsize, msize1, msize2, tsize
//Take inventory and get sizes
    size_t hasrownames = (in->names && in->names->rowct) ? 1 : 0;
    size_t hascolnames = in->names && 
                    (in->names->vector || in->names->colct || in->names->textct);
    size_t hasweights = (in->weights != NULL);

    size_t outsize_r = GSL_MAX(in->matrix ? in->matrix->size1 : 0, in->vector ? in->vector->size: 0);
    outsize_r = GSL_MAX(outsize_r, in->textsize[0]);
    outsize_r = GSL_MAX(outsize_r, wsize);
    if (in->names) outsize_r = GSL_MAX(outsize_r, in->names->rowct);
    outsize_r += hascolnames;

    size_t outsize_c = msize2;
    outsize_c += in->textsize[1];
    outsize_c += (vsize>0);
    outsize_c += (wsize>0);
    outsize_c += hasrownames + hasweights;

//Write to the printout data set.
    apop_data *printout = apop_text_alloc(NULL , outsize_r, outsize_c);
    if (hasrownames)
        for (size_t i=0; i < in->names->rowct; i ++)
            apop_text_set(printout, i + hascolnames, 0, "%s", in->names->row[i]);
    for (size_t i=0; i < vsize; i ++) //vsize may be zero.
        apop_text_set(printout, i + hascolnames, hasrownames, "%g", gsl_vector_get(in->vector, i));
    for (size_t i=0; i < msize1; i ++) //msize1 may be zero.
        for (size_t j=0; j < msize2; j ++)
            apop_text_set(printout, i + hascolnames, hasrownames + (vsize >0)+ j, "%g", gsl_matrix_get(in->matrix, i, j));
    if (in->textsize[0])
        for (size_t i=0; i < in->textsize[0]; i ++)
            for (size_t j=0; j < in->textsize[1]; j ++)
                apop_text_set(printout, i + hascolnames, hasrownames + (vsize>0)+ msize2 + j, "%s", in->text[i][j]);
    if (hasweights)
        for (size_t i=0; i < in->weights->size; i ++)
            apop_text_set(printout, i + hascolnames, outsize_c-1, "%g", gsl_vector_get(in->weights, i));

//column names
    if (hascolnames){
        if (vsize && in->names->vector)
            apop_text_set(printout, 0 , hasrownames, "%s", in->names->vector);
        if (msize2 && in->names)
            for (size_t i=0; i < in->names->colct; i ++)
                apop_text_set(printout, 0 , hasrownames + (vsize>0) + i, "%s", in->names->col[i]);
        if (in->textsize[1] && in->names)
            for (size_t i=0; i < in->names->textct; i ++)
                apop_text_set(printout, 0 , hasrownames + (vsize>0) + msize2 + i, "%s", in->names->text[i]);
        if (hasweights)
            apop_text_set(printout, 0 , outsize_c-1, "Weights");
    }

//get column sizes
    int colsizes[outsize_c];
    for (size_t i=0; i < outsize_c; i ++){
        colsizes[i] = strlen(printout->text[0][i]);
        for (size_t j=1; j < outsize_r; j ++)
            colsizes[i] = GSL_MAX(colsizes[i], strlen(printout->text[j][i]));
    }

//Finally, print
    if (in->names && in->names->title && strlen(in->names->title))
        printf("\t%s\n\n", in->names->title);
    for (size_t j=0; j < outsize_r; j ++){
        for (size_t i=0; i < outsize_c; i ++){
            white_pad(colsizes[i] - strlen(printout->text[j][i]) + 1);//one spare space.
            printf("%s", printout->text[j][i]);
            if (i > 0 && i< outsize_c-1) 
                printf(" %s ", apop_opts.output_delimiter);
        }
        printf("\n");
    }

    if (in->more) {
        printf("\n");
        apop_data_show(in->more);
    }
    apop_data_free(printout);
}