Пример #1
0
/*
	Create a Polygon from the numbers found in xystring, read as consecutive
	x,y pairs: x1 y1 x2 y2 ...

	The string is converted to numbers by NUMstring_to_numbers. At least
	6 values (3 vertices) are required and the number of values must be even;
	otherwise an error is thrown. A vertex that coincides exactly with its
	predecessor only triggers a warning; the Polygon is still created.

	Returns the result of me.transfer() on the newly created Polygon.
	Any MelderError raised inside is reported through
	Melder_throw ("Polygon not created.").
*/
Polygon Polygon_createSimple (wchar_t *xystring) {
	try {
		long numberOfPoints;   // first the number of parsed values, later the number of vertices
		// NOTE(review): the second argument (1) is presumably the base index of the vector,
		// which matches the 1-based indexing used below -- confirm against autoNUMvector.
		autoNUMvector<double> xys (NUMstring_to_numbers (xystring, &numberOfPoints), 1);
		if (numberOfPoints < 6) {
			Melder_throw ("There must be at least 3 points (= x,y pairs) in the Polygon");
		}
		if (numberOfPoints % 2 != 0) {
			Melder_throw ("One value is missing.");
		}
		numberOfPoints /= 2; // x,y pairs
		autoPolygon me = Polygon_create (numberOfPoints);
		for (long i = 1; i <= numberOfPoints; i++) {
			// Vertex i is stored at positions 2i-1 (x) and 2i (y) of the value list.
			my x[i] = xys[2 * i - 1];
			my y[i] = xys[2 * i];
			// Exact comparison on purpose: only literally repeated vertices are reported.
			if (i > 1 && my x[i] == my x[i - 1] && my y[i] == my y[i - 1]) {
				Melder_warning ("Two successive vertices are equal.");
			}
		}
		return me.transfer();
	} catch (MelderError) {
		Melder_throw ("Polygon not created.");
	}
}
Пример #2
0
/*************************************************************************
Internal decision forest builder.

Builds NTrees trees, each from SampleSize rows drawn from XY, stores them
consecutively in DF.Trees, and fills Rep with training-set and out-of-bag
(OOB) error estimates.

INPUT PARAMETERS:
    XY          -   data set, rows 0..NPoints-1. Columns 0..NVars-1 are
                    read as independent variables; column NVars is read as
                    the class index (NClasses>1, rounded with ap::round) or
                    as the regression target (NClasses=1).
    NPoints     -   number of rows, >=1
    NVars       -   number of independent variables, >=1
    NClasses    -   number of classes, >=1 (1 selects the regression code
                    paths below)
    NTrees      -   number of trees to build, >=1
    SampleSize  -   number of rows used per tree, 1<=SampleSize<=NPoints
    NFeatures   -   >=1, forwarded to dfbuildtree
    Flags       -   forwarded to dfbuildtree; here only the dfuseevs bit is
                    examined (it turns on the variable pool pruning and the
                    binary-variable pre-splitting)

OUTPUT PARAMETERS:
    Info        -   -1  if any size/count argument violates the limits above
                    -2  if NClasses>1 and some class index is outside
                        [0,NClasses)
                     1  otherwise
    DF          -   NVars, NClasses, NTrees, Trees and BufSize are set
    Rep         -   RelClsError, AvgCE, RMSError, AvgError, AvgRelError are
                    computed on the training set; the OOB* counterparts are
                    computed from rows that were left out of at least one
                    tree's sample.

On Info<0 the function returns immediately; DF and Rep are not touched.
*************************************************************************/
void dfbuildinternal(const ap::real_2d_array& xy,
     int npoints,
     int nvars,
     int nclasses,
     int ntrees,
     int samplesize,
     int nfeatures,
     int flags,
     int& info,
     decisionforest& df,
     dfreport& rep)
{
    int i;
    int j;
    int k;
    int tmpi;
    int lasttreeoffs;
    int offs;
    int ooboffs;
    int treesize;
    int nvarsinpool;
    bool useevs;
    dfinternalbuffers bufs;
    ap::integer_1d_array permbuf;
    ap::real_1d_array oobbuf;
    ap::integer_1d_array oobcntbuf;
    ap::real_2d_array xys;
    ap::real_1d_array x;
    ap::real_1d_array y;
    int oobcnt;
    int oobrelcnt;
    double v;
    double vmin;
    double vmax;
    bool bflag;

    
    //
    // Test for inputs
    //
    if( npoints<1||samplesize<1||samplesize>npoints||nvars<1||nclasses<1||ntrees<1||nfeatures<1 )
    {
        info = -1;
        return;
    }
    if( nclasses>1 )
    {
        //
        // Classification: every class index must lie in [0,NClasses)
        //
        for(i = 0; i <= npoints-1; i++)
        {
            if( ap::round(xy(i,nvars))<0||ap::round(xy(i,nvars))>=nclasses )
            {
                info = -2;
                return;
            }
        }
    }
    info = 1;
    
    //
    // Flags
    //
    useevs = flags/dfuseevs%2!=0;
    
    //
    // Allocate data, prepare header
    //
    // TreeSize is the buffer length reserved for one tree: one header cell
    // plus SampleSize-1 inner nodes and SampleSize leaf nodes.
    //
    treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize;
    permbuf.setbounds(0, npoints-1);
    bufs.treebuf.setbounds(0, treesize-1);
    bufs.idxbuf.setbounds(0, npoints-1);
    bufs.tmpbufr.setbounds(0, npoints-1);
    bufs.tmpbufr2.setbounds(0, npoints-1);
    bufs.tmpbufi.setbounds(0, npoints-1);
    bufs.varpool.setbounds(0, nvars-1);
    bufs.evsbin.setbounds(0, nvars-1);
    bufs.evssplits.setbounds(0, nvars-1);
    bufs.classibuf.setbounds(0, 2*nclasses-1);
    oobbuf.setbounds(0, nclasses*npoints-1);
    oobcntbuf.setbounds(0, npoints-1);
    df.trees.setbounds(0, ntrees*treesize-1);
    xys.setbounds(0, samplesize-1, 0, nvars);
    x.setbounds(0, nvars-1);
    y.setbounds(0, nclasses-1);
    for(i = 0; i <= npoints-1; i++)
    {
        permbuf(i) = i;
    }
    for(i = 0; i <= npoints*nclasses-1; i++)
    {
        oobbuf(i) = 0;
    }
    for(i = 0; i <= npoints-1; i++)
    {
        oobcntbuf(i) = 0;
    }
    
    //
    // Prepare variable pool and EVS (extended variable selection/splitting) buffers
    // (whether EVS is turned on or not):
    // 1. detect binary variables and pre-calculate splits for them
    // 2. detect variables with non-distinct values and exclude them from pool
    //
    for(i = 0; i <= nvars-1; i++)
    {
        bufs.varpool(i) = i;
    }
    nvarsinpool = nvars;
    if( useevs )
    {
        
        //
        // The pool is rebuilt by compaction: every non-constant variable J
        // is appended at position NVarsInPool. Removing a constant variable
        // by overwriting slot J with the last pool entry is NOT safe here,
        // because after an earlier removal slot J no longer has to hold
        // variable J (a constant variable could stay in the pool while a
        // useful one is dropped).
        // NVarsInPool<=J always holds, so the write below never clobbers an
        // entry that is still needed.
        //
        nvarsinpool = 0;
        for(j = 0; j <= nvars-1; j++)
        {
            vmin = xy(0,j);
            vmax = vmin;
            for(i = 0; i <= npoints-1; i++)
            {
                v = xy(i,j);
                vmin = ap::minreal(vmin, v);
                vmax = ap::maxreal(vmax, v);
            }
            if( ap::fp_eq(vmin,vmax) )
            {
                
                //
                // exclude variable from pool (constant column)
                //
                continue;
            }
            bufs.varpool(nvarsinpool) = j;
            nvarsinpool = nvarsinpool+1;
            bflag = false;
            for(i = 0; i <= npoints-1; i++)
            {
                v = xy(i,j);
                if( ap::fp_neq(v,vmin)&&ap::fp_neq(v,vmax) )
                {
                    bflag = true;
                    break;
                }
            }
            if( bflag )
            {
                
                //
                // non-binary variable
                //
                bufs.evsbin(j) = false;
            }
            else
            {
                
                //
                // Prepare
                // (binary variable: split at the midpoint; if rounding makes
                // the midpoint collapse onto VMin, use VMax instead)
                //
                bufs.evsbin(j) = true;
                bufs.evssplits(j) = 0.5*(vmin+vmax);
                if( ap::fp_less_eq(bufs.evssplits(j),vmin) )
                {
                    bufs.evssplits(j) = vmax;
                }
            }
        }
        
        //
        // mark unused tail of the pool
        //
        for(j = nvarsinpool; j <= nvars-1; j++)
        {
            bufs.varpool(j) = -1;
        }
    }
    
    //
    // RANDOM FOREST FORMAT
    // W[0]         -   size of array
    // W[1]         -   version number
    // W[2]         -   NVars
    // W[3]         -   NClasses (1 for regression)
    // W[4]         -   NTrees
    // W[5]         -   trees offset
    //
    //
    // TREE FORMAT
    // W[Offs]      -   size of sub-array
    //     node info:
    // W[K+0]       -   variable number        (-1 for leaf mode)
    // W[K+1]       -   threshold              (class/value for leaf node)
    // W[K+2]       -   ">=" branch index      (absent for leaf node)
    //
    //
    df.nvars = nvars;
    df.nclasses = nclasses;
    df.ntrees = ntrees;
    
    //
    // Build forest
    //
    offs = 0;
    for(i = 0; i <= ntrees-1; i++)
    {
        
        //
        // Prepare sample
        //
        // Partial shuffle of PermBuf: entry K is swapped with an entry at
        // K+randominteger(NPoints-K), so the first SampleSize entries are
        // this tree's training rows and the remaining ones are its
        // out-of-bag rows. Each selected row is copied into XYS.
        //
        for(k = 0; k <= samplesize-1; k++)
        {
            j = k+ap::randominteger(npoints-k);
            tmpi = permbuf(k);
            permbuf(k) = permbuf(j);
            permbuf(j) = tmpi;
            j = permbuf(k);
            ap::vmove(&xys(k, 0), 1, &xy(j, 0), 1, ap::vlen(0,nvars));
        }
        
        //
        // build tree, copy
        // (TreeBuf[0] holds the size of the tree that was just built)
        //
        dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs);
        j = ap::round(bufs.treebuf(0));
        ap::vmove(&df.trees(offs), 1, &bufs.treebuf(0), 1, ap::vlen(offs,offs+j-1));
        lasttreeoffs = offs;
        offs = offs+j;
        
        //
        // OOB estimates
        //
        // Every row not used for this tree is run through this tree only
        // (starting at LastTreeOffs); the output is accumulated per row.
        //
        for(k = samplesize; k <= npoints-1; k++)
        {
            for(j = 0; j <= nclasses-1; j++)
            {
                y(j) = 0;
            }
            j = permbuf(k);
            ap::vmove(&x(0), 1, &xy(j, 0), 1, ap::vlen(0,nvars-1));
            dfprocessinternal(df, lasttreeoffs, x, y);
            ap::vadd(&oobbuf(j*nclasses), 1, &y(0), 1, ap::vlen(j*nclasses,(j+1)*nclasses-1));
            oobcntbuf(j) = oobcntbuf(j)+1;
        }
    }
    df.bufsize = offs;
    
    //
    // Normalize OOB results
    // (average the accumulated outputs over the number of trees for which
    // the row was out-of-bag)
    //
    for(i = 0; i <= npoints-1; i++)
    {
        if( oobcntbuf(i)!=0 )
        {
            v = double(1)/double(oobcntbuf(i));
            ap::vmul(&oobbuf(i*nclasses), 1, ap::vlen(i*nclasses,i*nclasses+nclasses-1), v);
        }
    }
    
    //
    // Calculate training set estimates
    //
    rep.relclserror = dfrelclserror(df, xy, npoints);
    rep.avgce = dfavgce(df, xy, npoints);
    rep.rmserror = dfrmserror(df, xy, npoints);
    rep.avgerror = dfavgerror(df, xy, npoints);
    rep.avgrelerror = dfavgrelerror(df, xy, npoints);
    
    //
    // Calculate OOB estimates.
    // Only rows that were out-of-bag at least once contribute.
    //
    rep.oobrelclserror = 0;
    rep.oobavgce = 0;
    rep.oobrmserror = 0;
    rep.oobavgerror = 0;
    rep.oobavgrelerror = 0;
    oobcnt = 0;
    oobrelcnt = 0;
    for(i = 0; i <= npoints-1; i++)
    {
        if( oobcntbuf(i)!=0 )
        {
            ooboffs = i*nclasses;
            if( nclasses>1 )
            {
                
                //
                // classification-specific code
                // K is the true class, TmpI the class with the largest
                // averaged OOB output (first one wins on ties)
                //
                k = ap::round(xy(i,nvars));
                tmpi = 0;
                for(j = 1; j <= nclasses-1; j++)
                {
                    if( ap::fp_greater(oobbuf(ooboffs+j),oobbuf(ooboffs+tmpi)) )
                    {
                        tmpi = j;
                    }
                }
                if( tmpi!=k )
                {
                    rep.oobrelclserror = rep.oobrelclserror+1;
                }
                if( ap::fp_neq(oobbuf(ooboffs+k),0) )
                {
                    rep.oobavgce = rep.oobavgce-log(oobbuf(ooboffs+k));
                }
                else
                {
                    
                    //
                    // avoid log(0)
                    //
                    rep.oobavgce = rep.oobavgce-log(ap::minrealnumber);
                }
                for(j = 0; j <= nclasses-1; j++)
                {
                    if( j==k )
                    {
                        rep.oobrmserror = rep.oobrmserror+ap::sqr(oobbuf(ooboffs+j)-1);
                        rep.oobavgerror = rep.oobavgerror+fabs(oobbuf(ooboffs+j)-1);
                        rep.oobavgrelerror = rep.oobavgrelerror+fabs(oobbuf(ooboffs+j)-1);
                        oobrelcnt = oobrelcnt+1;
                    }
                    else
                    {
                        rep.oobrmserror = rep.oobrmserror+ap::sqr(oobbuf(ooboffs+j));
                        rep.oobavgerror = rep.oobavgerror+fabs(oobbuf(ooboffs+j));
                    }
                }
            }
            else
            {
                
                //
                // regression-specific code
                // (relative error is accumulated only for non-zero targets)
                //
                rep.oobrmserror = rep.oobrmserror+ap::sqr(oobbuf(ooboffs)-xy(i,nvars));
                rep.oobavgerror = rep.oobavgerror+fabs(oobbuf(ooboffs)-xy(i,nvars));
                if( ap::fp_neq(xy(i,nvars),0) )
                {
                    rep.oobavgrelerror = rep.oobavgrelerror+fabs((oobbuf(ooboffs)-xy(i,nvars))/xy(i,nvars));
                    oobrelcnt = oobrelcnt+1;
                }
            }
            
            //
            // update OOB estimates count.
            //
            oobcnt = oobcnt+1;
        }
    }
    if( oobcnt>0 )
    {
        rep.oobrelclserror = rep.oobrelclserror/oobcnt;
        rep.oobavgce = rep.oobavgce/oobcnt;
        rep.oobrmserror = sqrt(rep.oobrmserror/(oobcnt*nclasses));
        rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
        if( oobrelcnt>0 )
        {
            rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
        }
    }
}