예제 #1
0
파일: radiosity.c 프로젝트: mattjakob/s3d
Color *radiosity_prog(int n, Poly **p, Color *e, Color *rho)
{
  int src, rcv, iter = 0;
  Real ff, mts, *a = NEWTARRAY(n, Real);
  Color d, *dm = NEWTARRAY(n, Color);
  Color ma, *m = NEWTARRAY(n, Color);
	
  initialize(n, m, dm, a, p, e);
  while (iter-- < max_iter) {
    src = select_shooter(n, dm, a);
    if (converged(src, dm))
      break;
    for (rcv = 0; rcv < n; rcv++) {
      if (rcv == src || (ff = formfactor(src, rcv, n, p, a)) < REL_EPS)
	continue;
      d = c_scale(ff, c_mult(rho[rcv], dm[src]));

      m[rcv] = c_add(m[rcv], d);
      dm[rcv] = c_add(dm[rcv], d);
    }
    dm[src] = c_make(0,0,0);
  }
  ma = ambient_rad(n, dm, a);
  for (rcv = 0; rcv < n; rcv++)
    m[rcv] = c_add(m[rcv], ma);
  efree(a), efree(dm);
  return m;
}
static void make_filter(T *result_b, T *result_a, const complex4c *alpha, const complex4c *beta, int K, T sigma)
{
    const double denom = sigma * M_SQRT2PI;
    complex4c b[DERICHE_MAX_K], a[DERICHE_MAX_K + 1];
    int k, j;
        
    b[0] = alpha[0];    /* Initialize b/a = alpha[0] / (1 + beta[0] z^-1) */
    a[0] = make_complex(1, 0);
    a[1] = beta[0];
    
    for (k = 1; k < K; ++k)
    {   /* Add kth term, b/a += alpha[k] / (1 + beta[k] z^-1) */
        b[k] = c_mul(beta[k], b[k-1]);
        
        for (j = k - 1; j > 0; --j)
            b[j] = c_add(b[j], c_mul(beta[k], b[j - 1]));
        
        for (j = 0; j <= k; ++j)
            b[j] = c_add(b[j], c_mul(alpha[k], a[j]));
        
        a[k + 1] = c_mul(beta[k], a[k]);
        
        for (j = k; j > 0; --j)
            a[j] = c_add(a[j], c_mul(beta[k], a[j - 1]));
    }
    
    for (k = 0; k < K; ++k)
    {
        result_b[k] = (T)(b[k].real / denom);
        result_a[k + 1] = (T)a[k + 1].real;
    }
    
    return;
}
예제 #3
0
파일: rshade.c 프로젝트: mattjakob/s3d
Color ray_shade(int level, Real w, Ray v, RContext *rc, Object *ol)
{
  Inode *i = ray_intersect(ol, v);
  if (i != NULL) { Light *l; Real wf;
    Material *m = i->m;
    Vector3 p = ray_point(v, i->t); 
    Cone  recv = cone_make(p, i->n, PIOVER2);
    Color c = c_mult(m->c, c_scale(m->ka, ambient(rc)));
    rc->p = p;

    for (l = rc->l; l != NULL; l = l->next) 
      if ((*l->transport)(l, recv, rc) && (wf = shadow(l, p, ol)) > RAY_WF_MIN)
	c = c_add(c, c_mult(m->c,
		     c_scale(wf * m->kd * v3_dot(l->outdir,i->n), l->outcol)));

    if (level++ < MAX_RAY_LEVEL) {
      if ((wf = w * m->ks) > RAY_WF_MIN) {
	Ray r = ray_make(p, reflect_dir(v.d, i->n));
        c = c_add(c, c_mult(m->s,
		     c_scale(m->ks, ray_shade(level, wf, r, rc, ol))));
      }
      if ((wf = w * m->kt) > RAY_WF_MIN) {
	Ray t = ray_make(p, refract_dir(v.d, i->n, (i->enter)? 1/m->ir: m->ir));
	if (v3_sqrnorm(t.d) > 0) {
	  c = c_add(c, c_mult(m->s,
		       c_scale(m->kt, ray_shade(level, wf, t, rc, ol))));
	}
      }
    }
    inode_free(i); 
    return c;
  } else {
    return BG_COLOR;
  }
}
예제 #4
0
파일: material.c 프로젝트: cheque/s3d
Color shiny_surface(RContext *rc)
{
  Color ce = environment_map(rc->m->tinfo, reflect_dir(rc->v, rc->n));

  return c_add(c_scale(rc->m->ka, ambient(rc)),
            c_scale(rc->m->ks, c_add(ce, specular(rc))));
}
예제 #5
0
파일: material.c 프로젝트: cheque/s3d
Color textured_plastic(RContext *rc)
{
  Color c, ct = texture_map(rc->m->tinfo, rc->t);

  c = c_add(c_mult(ct, c_add(c_scale(rc->m->ka, ambient(rc)),
                             c_scale(rc->m->kd, diffuse(rc)))),
            c_mult(rc->m->s, c_scale(rc->m->ks, specular(rc))));
  return c;
}
예제 #6
0
void polymult (header *hd)
{	header *st=hd,*hd1,*result;
	int c,c1,c2,i,r,j,k;
	double *m1,*m2,*mr,x;
	complex *mc1,*mc2,*mcr,xc,hc;
	interval *mi1,*mi2,*mir,xi,hi;
	hd1=next_param(st);
	equal_params_2(&hd,&hd1); if (error) return;
	getmatrix(hd,&r,&c1,&m1); if (r!=1) wrong_arg();
	getmatrix(hd1,&r,&c2,&m2); if (r!=1) wrong_arg();
	if ((LONG)c1+c2-1>INT_MAX) wrong_arg();
	c=c1+c2-1;
	if (iscomplex(hd))
	{	mc1=(complex *)m1; mc2=(complex *)m2;
		result=new_cmatrix(1,c,""); if (error) return;
		mcr=(complex *)matrixof(result);
		c_copy(xc,*mc1); mc1++;
		for (i=0; i<c2; i++) c_mult(xc,mc2[i],mcr[i]);
		for (j=1; j<c1; j++)
		{	c_copy(xc,*mc1); mc1++;
			for (k=j,i=0; i<c2-1; i++,k++)
			{	c_mult(xc,mc2[i],hc);
				c_add(hc,mcr[k],mcr[k]);
			}
			c_mult(xc,mc2[i],mcr[k]);
		}
	}
	else if (isinterval(hd))
	{	mi1=(interval *)m1; mi2=(interval *)m2;
		result=new_imatrix(1,c,""); if (error) return;
		mir=(interval *)matrixof(result);
		i_copy(xi,*mi1); mi1++;
		for (i=0; i<c2; i++) i_mult(xi,mi2[i],mir[i]);
		for (j=1; j<c1; j++)
		{	i_copy(xi,*mi1); mi1++;
			for (k=j,i=0; i<c2-1; i++,k++)
			{	i_mult(xi,mi2[i],hi);
				c_add(hi,mir[k],mir[k]);
			}
			c_mult(xi,mi2[i],mir[k]);
		}
	}
	else if (isreal(hd))
	{	result=new_matrix(1,c,""); if (error) return;
		mr=matrixof(result);
		x=*m1++;
		for (i=0; i<c2; i++) mr[i]=x*m2[i];
		for (j=1; j<c1; j++)
		{	x=*m1++;
			for (k=j,i=0; i<c2-1; i++,k++) mr[k]+=x*m2[i];
			mr[k]=x*m2[i];
		}
	}
	else wrong_arg();
	moveresult(st,result);
}
예제 #7
0
파일: mie.c 프로젝트: JiapengHuang/SPP
struct c_complex Lentz_Dn(struct c_complex z,long n)

/*:10*/
#line 126 "./mie.w"

{
struct c_complex alpha_j1,alpha_j2,zinv,aj;
struct c_complex alpha,result,ratio,runratio;

/*12:*/
#line 156 "./mie.w"


zinv= c_sdiv(2.0,z);
alpha= c_smul(n+0.5,zinv);
aj= c_smul(-n-1.5,zinv);
alpha_j1= c_add(aj,c_inv(alpha));
alpha_j2= aj;
ratio= c_div(alpha_j1,alpha_j2);
runratio= c_mul(alpha,ratio);


/*:12*/
#line 131 "./mie.w"


do
/*13:*/
#line 179 "./mie.w"

{
aj.re= zinv.re-aj.re;
aj.im= zinv.im-aj.im;
alpha_j1= c_add(c_inv(alpha_j1),aj);
alpha_j2= c_add(c_inv(alpha_j2),aj);
ratio= c_div(alpha_j1,alpha_j2);
zinv.re*= -1;
zinv.im*= -1;
runratio= c_mul(ratio,runratio);
}

/*:13*/
#line 134 "./mie.w"


while(fabs(c_abs(ratio)-1.0)> 1e-12);

result= c_add(c_sdiv((double)-n,z),runratio);
return result;
}
예제 #8
0
void calc_aver(FILE *fp,int nframes,int npair,t_pair pair[],t_sij *spec,
	       real maxdist)
{
  int     i,j,m;
  real    nf_1,fac,md_6;
  complex c1,c2,dc2;
  
  md_6 = pow(maxdist,-6.0);
  fac  = 4*M_PI/5;
  nf_1 = 1.0/nframes;
  for(i=0; (i<npair); i++) {
    c2.re = 0;
    c2.im = 0;
    fprintf(fp,"%5d  %5d",pair[i].ai,pair[i].aj);
    for(m=0; (m<5); m++) {
      c1.re  = spec[i].Ylm[m].re*nf_1;
      c1.im  = spec[i].Ylm[m].im*nf_1;
      dc2    = c_sqr(c1);
      c2     = c_add(dc2,c2);
      
      if (c1.im > 0)
	fprintf(fp,"  %8.3f+i%8.3f",c1.re,c1.im);
      else
	fprintf(fp,"  %8.3f-i%8.3f",c1.re,-c1.im);
    }
    fprintf(fp,"\n");
    spec[i].rij_3 *= nf_1;
    spec[i].rij_6 *= nf_1;
    spec[i].y2.re  = fac*c2.re;
    spec[i].y2.im  = fac*c2.im;
    spec[i].bNOE   = (spec[i].rij_6 > md_6);
  }
}
예제 #9
0
// 1D Fourier Transform: fft_1d_helper(x, N, stride, y)
// if N > 1 then
//    1. take the FFT of the even elements in x, and store it in the first half of y
//         x' = x
//         stride' = 2 * stride
//         y' = y
//    2. take the FFT of the odd elements in x, and store it in the second half of y
//         x' = x + stride
//         stride' = 2 * stride
//         y' = y + N/2
//    3. for each frequency step k in 0 ... N/2
//       i. compute the component at Wkn = exp(-j*2*pi/N * k)
//          y[k] = y[k] + Wkn * y[k + N/2]
//       ii. compute the component at Wkn = exp(-j*2*pi/N * (k + N/2))
//          y[k + N/2] = y[k] + Wkn * y[k + N/2]
// otherwise just return the element at x[0]
void fft_1d_helper(complex* x, int N, int stride, complex* y,
                   complex* Wkn, int Wkn_stride){
  if(N > 1) {
    fft_1d_helper(x, N/2, 2*stride, y, Wkn, 2*Wkn_stride);
    fft_1d_helper(x + stride, N/2, 2*stride, y + N/2, Wkn, 2*Wkn_stride);
    
    for(int k=0; k<N/2; k++) {
      complex Wkn1 = Wkn[k*Wkn_stride];
      complex Wkn2 = Wkn[(k + N/2)*Wkn_stride];
      complex yk1 = c_add(y[k], c_mult(Wkn1, y[k + N/2]));
      complex yk2 = c_add(y[k], c_mult(Wkn2, y[k + N/2]));
      y[k] = yk1;
      y[k+N/2] = yk2;
    }
  } else {
    y[0] = x[0];
  }
}
예제 #10
0
파일: radiosity.c 프로젝트: mattjakob/s3d
static Color ambient_rad(int n, Color *dm, Real *a)
{
  int i;
  Real aa = 0;
  Color ma = c_make(0,0,0);
  for (i = 0; i < n; i++) {
    ma = c_add(ma, c_scale(a[i], dm[i]));
    aa += a[i];
  }
  return c_scale(1.0/aa, ma);
}
예제 #11
0
파일: pol.c 프로젝트: rforge/muste
static struct complex *pol_value(
struct polynom *p,
struct complex *z,  /* pointer to argument */
struct complex *v   /* pointer to function value */
)
        {
        int i;

        v->x=p->a[p->n].x;
        v->y=p->a[p->n].y;
        for (i=p->n-1; i>=0; --i)
            c_add(v,&(p->a[i]),c_mult(v,z,v));
        return(v);
        }
예제 #12
0
/**
 * \brief Compute the variance of the impulse response
 * \param poles0    unscaled pole locations
 * \param q         rescaling parameter
 * \param K         number of poles
 * \return variance achieved by poles = poles0^(1/q)
 * \ingroup vyv_gaussian
 */
static double variance(const complex4c *poles0, int K, double q)
{
	complex4c sum = { 0, 0 };
	int k;

	for (k = 0; k < K; ++k)
	{
		complex4c z = c_real_pow(poles0[k], 1 / q), denom = z;
		denom.real -= 1;
		/* Compute sum += z / (z - 1)^2. */
		sum = c_add(sum, c_div(z, c_mul(denom, denom)));
	}

	return 2 * sum.real;
}
예제 #13
0
//SUMMARY
// Adds the cache entry 'ent' to the end
//NOTES
// Must be locked over the cache - can be unlocked immediately following the call
// No other synchronization needed, since old entries will automatically be displaced once cache is full
void cache_putEntry(struct cache_entry* ent) {

	assert( ent->filedata != NULL );

	fprintf(stderr, "cache_putEntry: Adding entry for filename %s\n", ent->filename);

	assert( cache_size <= max_cache_size );

	if (cache_size == max_cache_size) {
		fprintf(stderr, "cache_putEntry: Cache full, shifting (%d, %d)\n", cache_size, max_cache_size);
		destroyCacheEntry( c_shift() ); // This will also free the file data stored in the entry
	}

	c_add(ent);

}
예제 #14
0
파일: pol.c 프로젝트: rforge/muste
static struct polynom *pol_mult(struct polynom *p,struct polynom *p1,struct polynom *p2)
        {
        int i,j;
        struct complex tulo;

        p->n=p1->n+p2->n;
        if (p->n>MAXN-1) { pol_dim_overflow(); return(p); }

        for (i=0; i<=p->n; ++i)
            p->a[i].x=p->a[i].y=0.0;
        for (i=0; i<=p1->n; ++i)
            for (j=0; j<=p2->n; ++j)
                c_add(&(p->a[i+j]),&(p->a[i+j]),
                          c_mult(&tulo,&(p1->a[i]),&(p2->a[j])));

        return(p);
        }
예제 #15
0
파일: mie.c 프로젝트: JiapengHuang/SPP
void Dn_down(struct c_complex z,long nstop,struct c_complex*D)

/*:19*/
#line 247 "./mie.w"

{
long k;
struct c_complex zinv,k_over_z;

D[nstop-1]= Lentz_Dn(z,nstop);
zinv= c_inv(z);

for(k= nstop-1;k>=1;k--){
k_over_z= c_smul((double)k,zinv);
D[k-1]= c_sub(k_over_z,c_inv(c_add(D[k],k_over_z)));
}
}
예제 #16
0
/**
 * \brief Derivative of variance with respect to q
 * \param poles0    unscaled pole locations
 * \param q         rescaling parameter
 * \param K         number of poles
 * \return derivative of variance with respect to q
 * \ingroup vyv_gaussian
 *
 * This function is used by compute_q() in solving for q.
 */
static double dq_variance(const complex4c *poles0, int K, double q)
{
	complex4c sum = { 0, 0 };
	int k;

	for (k = 0; k < K; ++k)
	{
		complex4c z = c_real_pow(poles0[k], 1 / q), w = z, denom = z;
		w.real += 1;
		denom.real -= 1;
		/* Compute sum += z log(z) (z + 1) / (z - 1)^3 */
		sum = c_add(sum, c_div(c_mul(c_mul(z, c_log(z)), w),
			c_real_pow(denom, 3)));
	}

	return (2 / q) * sum.real;
}
예제 #17
0
void polyadd (header *hd)
{	header *st=hd,*hd1,*result;
	int c,c1,c2,i,r;
	double *m1,*m2,*mr;
	complex *mc1,*mc2,*mcr;
	interval *mi1,*mi2,*mir;
	hd1=next_param(st);
	equal_params_2(&hd,&hd1); if (error) return;
	getmatrix(hd,&r,&c1,&m1); if (r!=1) wrong_arg();
	getmatrix(hd1,&r,&c2,&m2); if (r!=1) wrong_arg();
	c=max(c1,c2);
	if (iscomplex(hd)) /* complex values */
	{	mc1=(complex *)m1; mc2=(complex *)m2;
		result=new_cmatrix(1,c,""); if (error) return;
		mcr=(complex *)matrixof(result);
		for (i=0; i<c; i++)
		{	if (i>=c1) { c_copy(*mcr,*mc2); mcr++; mc2++; }
			else if (i>=c2) { c_copy(*mcr,*mc1); mcr++; mc1++; }
			else { c_add(*mc1,*mc2,*mcr); mc1++; mc2++; mcr++; }
		}
	}
	else if (isinterval(hd))
	{	mi1=(interval *)m1; mi2=(interval *)m2;
		result=new_imatrix(1,c,""); if (error) return;
		mir=(interval *)matrixof(result);
		for (i=0; i<c; i++)
		{	if (i>=c1) { i_copy(*mir,*mi2); mir++; mi2++; }
			else if (i>=c2) { i_copy(*mir,*mi1); mir++; mi1++; }
			else { i_add(*mi1,*mi2,*mir); mi1++; mi2++; mir++; }
		}
	}
	else if (isreal(hd))
	{	result=new_matrix(1,c,""); if (error) return;
		mr=matrixof(result);
		for (i=0; i<c; i++)
		{	if (i>=c1) { *mr++ = *m2++; }
			else if (i>=c2) { *mr++ = *m1++; }
			else { *mr++ = *m1++ + *m2++; }
		}
	}
	else wrong_arg();
	moveresult(st,result);
}
예제 #18
0
void rfft (long m0, long p0, long q0, long n)
/***** rfft 
	make a fft on x[m],x[m+q0],...,x[m+(p0-1)*q0] (p points).
	one has xi_p0 = xi_n^n = zz[n] ; i.e., p0*n=nn.
*****/
{	long p,q,m,l;
	long mh,ml;
	int found=0;
	complex sum,h;
	if (p0==1) return;
	if (test_key()==escape) { error=301; return; }
	if (p0%2==0) { p=p0/2; q=2; }
	else
	{	q=3;
		while (q*q<=p0)
		{	if (p0%q==0) 
			{	found=1; break; }
			q+=2;
		}
		if (found) p=p0/q;
		else { q=p0; p=1; }
	}
	if (p>1) for (m=0; m<q; m++) 
		rfft((m0+m*q0)%nn,p,q*q0,nn/p);
	mh=m0;
	for (l=0; l<p0; l++)
	{	ml=l%p;
		c_copy(sum,ff[(m0+ml*q*q0)%nn]);
		for (m=1; m<q; m++)
		{	c_mult(ff[(m0+(m+ml*q)*q0)%nn],zz[(n*l*m)%nn],h);
			c_add(sum,h,sum);
		}
		sum[0]/=q; sum[1]/=q;
		c_copy(fh[mh],sum);
		mh+=q0; if (mh>=nn) mh-=nn;
	}
	for (l=0; l<p0; l++)
	{	c_copy(ff[m0],fh[m0]);
		m0+=q0;
	}
}
예제 #19
0
// resample_1d(complex* x, int N, int stride, float* n)
// 1. create a temporary buffer of size N
// 2. for step i in 0 ... N
// 3.   if n[i] is outside of range 0 ... N, then x_interp = 0
// 4.   get the lower element x_low = x[floor(n[i]) * stride]
// 5.   get the higher element x_high = x[ceil(n[i]) * stride]
// 6.   compute the interpolation mixing ratio, ratio = n[i] - floor(n[i])
// 7.   compute the interpolated value, x_interp = (1 - ratio) * x_low + ratio * x_high
// 8.   store the interpolated value into the buffer
// 9. copy the values from the temporary buffer back into x
void resample_1d(complex* x, int N, int stride, float* n) {  
  int i;
  complex buffer[N];

  for(i=0; i<N; i++){
    complex x_interp;
    if(n[i] < 0 || n[i] > N - 1){
      x_interp.real = 0;
      x_interp.imag = 0;
    } else {
      complex x_low = x[(int)floor(n[i]) * stride];
      complex x_high = x[(int)ceil(n[i]) * stride];
      float ratio = n[i] - floor(n[i]);
      x_interp = c_add(c_scalar_mult(x_low, 1 - ratio), c_scalar_mult(x_high, ratio));
    }
    buffer[i] = x_interp;
  }

  for(i=0; i<N; i++)
    x[i*stride] = buffer[i];
}
예제 #20
0
void polyzeros (header *hd)
{	header *st=hd,*result;
	int i,j,r,c;
	double *m,*mr,x;
	complex *mc,*mcr,xc,hc;
	hd=getvalue(hd); if (error) return;
	if (hd->type==s_real || hd->type==s_matrix)
	{	getmatrix(hd,&r,&c,&m);
		if (r!=1) wrong_arg();
		result=new_matrix(1,c+1,""); if (error) return;
		mr=matrixof(result);
		mr[0]=-m[0]; mr[1]=1.0;
		for (i=1; i<c; i++)
		{	x=-m[i]; mr[i+1]=1.0;
			for (j=i; j>=1; j--) mr[j]=mr[j-1]+x*mr[j];
			mr[0]*=x;
		}
	}
	else if (hd->type==s_complex || hd->type==s_cmatrix)
	{	getmatrix(hd,&r,&c,&m); mc=(complex *)m;
		if (r!=1) wrong_arg();
		result=new_cmatrix(1,c+1,""); if (error) return;
		mcr=(complex *)matrixof(result);
		mcr[0][0]=-mc[0][0]; mcr[0][1]=-mc[0][1];
		mcr[1][0]=1.0; mcr[1][1]=0.0;
		for (i=1; i<c; i++)
		{	xc[0]=-mc[i][0]; xc[1]=-mc[i][1];
			mcr[i+1][0]=1.0; mcr[i+1][1]=0.0;
			for (j=i; j>=1; j--) 
			{	c_mult(xc,mcr[j],hc);
				c_add(hc,mcr[j-1],mcr[j]);
			}
			c_mult(xc,mcr[0],mcr[0]);
		}
	}
	else wrong_arg();
	moveresult(st,result);
}
예제 #21
0
파일: pol.c 프로젝트: rforge/muste
static struct polynom *pol_add(struct polynom *p,struct polynom *p1,struct polynom *p2)
        {
        int i;
// RS REM        struct complex tulo;

        p->n=(p1->n>p2->n)? (p1->n):(p2->n);

        for (i=0; i<=p->n; ++i)
            {
            if (i<=p1->n)
                {
                if (i<=p2->n)
                    c_add(&(p->a[i]),&(p1->a[i]),&(p2->a[i]));
                else
                    { p->a[i].x=p1->a[i].x; p->a[i].y=p1->a[i].y; }
                }
            else
                { p->a[i].x=p2->a[i].x; p->a[i].y=p2->a[i].y; }
            }
        i=p->n;
        while (c_zero(&(p->a[i])) && i>0) --i;
        p->n=i;
        return(p);
        }
예제 #22
0
/////////////////////////////////////////////////////////////////////////////////////////////
//int main(int argc, char *argv[ ])
//{
 int main(void)
 {
   	sortingindex = 0;
	if((fpp = fopen(INPUT_FILE, "r")) == NULL)
	{
		printf("Cannot open 'parameter_file'.\n");
	}

	for(j = 0; j < 11; j++)
	{
		if(fscanf(fpp, "%lf", &tmp) != EOF) 
		{
				my_array[j] = tmp;
				
		} else 		{
			printf("Not enough data in 'input_parameter'!");
		}
	}

	fclose(fpp);
 

   	In_n= (int) my_array[0];
	In_vect_n= (int) my_array[1];
	Out_n= (int) my_array[2];
	Mf_n= (int) my_array[3];
	training_data_n= (int) my_array[4];
	checking_data_n= (int) my_array[5];
	epoch_n= (int) my_array[6];
	step_size=my_array[7];
	increase_rate=my_array[8];
	decrease_rate=my_array[9];
	threshold = my_array[10];
		
	Rule_n = (int)pow((double)Mf_n, (double)In_n); //number of rules 
	Node_n = In_n + In_n*Mf_n + 3*Rule_n + In_n*Rule_n + Out_n;

	/* allocate matrices and memories */
	int trnnumcheck[training_data_n + 1];
	int trnnumchecku[training_data_n + 1];
	for(i=0; i<training_data_n +1; i++)
	{
	trnnumcheck[i]=0;
	trnnumchecku[i]=0;
	}
	
	diff =(double **)create_matrix(Out_n, training_data_n, sizeof(double)); 
	double chkvar[checking_data_n];
	double cdavg[checking_data_n];
	double chkvar_un[checking_data_n];
	double cdavg_un[checking_data_n];
	target = calloc(Out_n, sizeof(double));
	de_out = calloc(Out_n, sizeof(double));
	node_p = (NODE_T **)create_array(Node_n, sizeof(NODE_T *)); 
	config = (int **)create_matrix(Node_n, Node_n, sizeof(int)); 
	training_data_matrix = (double **)create_matrix(training_data_n, In_n*In_vect_n + Out_n, sizeof(double));
	if(checking_data_n > 0)
	{
		checking_data_matrix = (double **)create_matrix(checking_data_n, In_n*In_vect_n +Out_n, sizeof(double));
		checking_data_matrix_un = (double **)create_matrix(checking_data_n, Out_n, sizeof(double));
		chk_output =  (double **)create_matrix(checking_data_n, Out_n, sizeof(double));
	}
	layer_1_to_4_output = (COMPLEX_T **)create_matrix(training_data_n, In_n*Mf_n + 3*Rule_n, sizeof(COMPLEX_T));
	trn_rmse_error = calloc(epoch_n, sizeof(double));
	trnNMSE = calloc(epoch_n, sizeof(double));
	chk_rmse_error = calloc(epoch_n, sizeof(double));
	kalman_parameter = (double **)create_matrix(Out_n ,(In_n*In_vect_n + 1)*Rule_n, sizeof(double)); 
	kalman_data = (double **)create_matrix(Out_n ,(In_n*In_vect_n + 1)*Rule_n, sizeof(double));
	step_size_array = calloc(epoch_n, sizeof(double));
	ancfis_output = (double **)create_matrix(training_data_n , Out_n, sizeof(double)); 
	trn_error =calloc(Out_n +1, sizeof(double));
	chk_error_n = calloc(Out_n +1, sizeof(double));// changing size for adding new error measures
	chk_error_un = calloc(Out_n +1, sizeof(double));// changing size for adding new error measures
	trn_datapair_error = calloc(training_data_n, sizeof(double));
	trn_datapair_error_sorted = (double **)create_matrix(2,training_data_n, sizeof(double));
	NMSE = calloc(Out_n, sizeof(double));
	NDEI = calloc(Out_n, sizeof(double));
	unNMSE = calloc(Out_n, sizeof(double));
	unNDEI = calloc(Out_n, sizeof(double));
	//Build Matrix of 0 nd 1 to show the connected nodes
	gen_config(In_n, Mf_n,Out_n, config);//gen_config.c
	//With the above matrix, build ANCFIS connected nodes
	build_ancfis(config); //datastru.c
	//Find total number of nodes in layer 1 and 5
	parameter_n = set_parameter_mode(); //datastru.c
	parameter_array = calloc(parameter_n, sizeof(double));
	initpara(TRAIN_DATA_FILE, training_data_n, In_n, In_vect_n+1, Mf_n); // initpara.c
// after this step, the parameters (they are present in layer 1 and layer 5 only) are assigned a random initial value 
// using some basic algebra and these value are then stored in "para.ini"
	get_parameter(node_p,Node_n,INIT_PARA_FILE); //input.c
// after this step, the initial random values of the parametrs are read from "oara.ini" and assigned to the appropriate nodes in the node structure by accessing their para list.
	//Get training and testing data
	get_data(TRAIN_DATA_FILE, training_data_n, training_data_matrix); //input.c
// after this step, the training input data is read from the "data.trn" fle and stroed in the training data matrix.
	get_data(CHECK_DATA_FILE, checking_data_n, checking_data_matrix); //input.c
// after the above step, the checking data is read from the "data.chk" file and then stored in the checking data matrix.

	for(i=0; i< Out_n; i++)
	{
	for(j=0; j<training_data_n; j++)
	{
	trnavg = trnavg + training_data_matrix[j][(i+1)*In_vect_n +i];
	}
	}
	trnavg = trnavg /(Out_n * training_data_n);
	
	for(i=0; i< Out_n; i++)
	{
	for(j=0; j<training_data_n; j++)
	{
	temp = training_data_matrix[j][(i+1)*In_vect_n +i]- trnavg;
	temp = temp*temp;
	trnvariance = trnvariance + temp;
	}
	}
	trnvariance = trnvariance /((Out_n * training_data_n)-1);

	temp = 0.0;
	for(i=0; i< Out_n; i++)
	{
	for(j=0; j< checking_data_n; j++)
	{
	chkavg = chkavg + checking_data_matrix[j][(i+1)*In_vect_n +i];
	}
	}
	chkavg = chkavg /(Out_n * checking_data_n);
	
	for(i=0; i< Out_n; i++)
	{
	for(j=0; j<checking_data_n; j++)
	{
	temp = checking_data_matrix[j][(i+1)*In_vect_n +i]- chkavg;
	temp = temp*temp;
	chkvariance = chkvariance + temp;
	}
	}
	chkvariance = chkvariance /((Out_n * checking_data_n)-1);
	printf("epochs \t trn error \t tst error\n");
	printf("------ \t --------- \t ---------\n");
	//printf("not entering the epoch loop and the i loop yoyoyo\n");
/**************
	for(ep_n = 0; ep_n < epoch_n; ep_n++)
	{ 
		//step_size_pointer= &step_size;		
		//printf("epoch numbernumber %d \n", ep_n);	
		//step_size_array[ep_n] = step_size_pointer;
		step_size_array[ep_n] = step_size;
	// after the above step, the updated stepsize at the end of the last loop is stored in the step_size_array.
	// this will keep happening every time we start en epoch and hence at the end of the loop, step_size_array will 
	// have a list of all the updated step sizes. Since this is a offline version, step sizes are updated only
	// at the end of an epoch. 
		for(m = 0; m < Out_n; m++)
		{ 	
			//printf("m loop number %d \n", m);	
			for(j = 0; j < training_data_n; j++)
			{ 
				//printf("j loop number %d \n", j);				
				//copy the input vector(s) to input node(s)
				put_input_data(node_p,j, training_data_matrix); //input.c
	// after this(above) step, the input data is transferred frm the training data matrix to the "node" structure.
				//printf("testing \n");	
				//printf("reeeetesting \n");	
				target[m] = training_data_matrix[j][(m+1)*In_vect_n+m]; // *** 
	// this step assigns the value of the "m"th output of "j" th trainig data pair to target.
				//printf("testing \n");	
				//forward pass, get node outputs from layer 1 to layer 4
				calculate_output(In_n, In_n + In_n*Mf_n + 3*Rule_n - 1, j); //forward.c
	// after this step, output of nodes in layer 1 to 4 is calculated. Please note that when this happens for the first
	// time, i.e. when ep_n=0, our network parametrs are already initialized. thus, it is possible to get the
	// output of each node using the function definitios proposed in forward.c. After first epoch, our parametrs get 
	// updated and this output is then calculated using teh new parameters. The essential point to note here is that
	// we can always calculate the output of each node since we have already initialized our parameters.
				//printf("testing \n");	
				//put outputs of layer 1 to 4 into layer_1_to_4_output
		
				for(k = 0; k < Mf_n*In_n + 3*Rule_n; k++)
				{
				//printf("testing \n");	
				layer_1_to_4_output[j][k] = *node_p[k + In_n]->value;
				}
	// the above loop simply puts the values of nodes from layer 1 to layer 4 in the layer_1_to_4_output matrix.

				//identify layer 5 params using LSE (Kalman filter)
				//printf("testing \n");	
				get_kalman_data(kalman_data, target); //kalman.c
	// this function call finds out the values of O4iXnl .. these are basically the coefficients
	// of the kalman parametrs for a given training data pair
	//puts them in kalman_data matrix.
	// this kalman_data matrix has In_n number of rows and number of columns equal to number of parametrs that are
	// responsible for determining each output... as stated above, the outputs are actually the coefficients of the
	// parameters.

				//printf("testing \n");	
				//calculate Kalman parameters
				
				kalman(ep_n, j+(m*training_data_n), m, kalman_data, kalman_parameter,target); //kalman.c
	// this function call evaluates kalman parametrs for a given output, for a given epoch.. that is it takes the epoch 
	// number from us, takes the info about how many times has kalman been invoked before, also takes in the
	// output number(row number) for whihc the parametrs are to be found out... it also takes kalman_data and reads 
	// from it to estimate the kalman parameters... it also takes target .. and stores the output in the mth row of 
	// kalman_parameter.
				//printf("testing \n");	
			}
	// let me tell u what the abopve loop is doing.. after observing closely, it is easy to see that in the above loop, 
	// for a given output node, one by one, all the training data are taken inside the ANCFIS structure, outputs
	// are calculated from one to 4, then a recursive kalman filetr is used to identify the kalman
	// parametrs corresponding to the output node.. these kalman parameters are updated after every tarining data pair 
	// and finally at the end of all the training data, we have an estimate for the kalman parametrs corresponding to 		// the output node.
		}
	// thus, at the of the above loop, the kalman parametrs for all the output nodes are evaluated...

	// now, we are ready to actually calculate the outputs.. plase remember that, all this while, the actual 
	// values of the parametrs of nodes in layer 1 and layer 5 are the ones that were randomly initialized.

		for(j = 0; j < training_data_n; j++)
		{ 
			//printf("testing 1\n");	
			put_input_data(node_p,j, training_data_matrix); //input.c
			//printf("testing 2 \n");	
			for(k = 0; k < Mf_n*In_n + 3*Rule_n; k++)
			{
				*node_p[k + In_n]->value = layer_1_to_4_output[j][k];
			}
	// u must be able to see that in the above two loops, each time, whatever output we got for a given training 
	// datta pair, it was safely stored in layer_1_to_4 array...and each time, the value on the actual nodes in the
	// structure got changed.. due to new incoming training data pair..this was periodic with period trainingdata_n..
	// that is for each output node, we got the same results for a given training dat aapir.. that is the node values
	// were independent of m. Now, for a given traing data pair, we are getting those value back in the actual node 
	// node structure from that laye blh blah matrix..

			//printf("testing 3\n");	
			put_kalman_parameter(Out_n,kalman_parameter); //kalman.c
	// using this function call, we are placing the setimated value of the layer 5 parametrs in the node structure
	// by accessing each node and its parameter list.
			// Do forward pass for L5 and L6
			calculate_output(In_n + In_n*Mf_n + 3*Rule_n, Node_n, j); //forward.c
	// for a given value of the training data pair, this function calculates the output of layer 5 and layer 6 nodes 
	// and places them in the node structure.

			calculate_root(training_data_matrix,diff,j,node_p); //debug.c
	// this function call calculates the square of the erro between the predicted value of an output node and the 
	// corresponding actual value in the training data matrix..(actual output) and stores it in diff.
	// this call performs the above action for a given training data pair and for all output nodes.

			// Do backward pass and calculate all error rate for each node
			calculate_de_do(j,Out_n,target,ancfis_output); //backward.c
	// calculates de_do for each node fora given training data pair
			update_de_dp(j); //de_dp.c	
	// updates de_do for each node....
		}
	// thus at the end of this loop, estimated outputs for all the training data are calculated..also back propogatin 
	// is done and de_dp for all the nodes is updated.
		
		//printf("testing 1\n");	
		calculate_trn_err(diff,trn_error,trn_datapair_error,training_data_n); //debug.c
		//printf("testing 2 \n");	
		//training_error_measure(target,ancfis_output,diff, training_data_n, trn_error,out_n); //trn_err.c
		trn_rmse_error[ep_n] = trn_error[Out_n];
		printf("%3d \t %.11f \n", ep_n+1, trn_error[Out_n]);
		//Find RMSE of testing error
	/*************************************	if(checking_data_n != 0) 
		{
			printf("testing 3 \n");	
			epoch_checking_error(checking_data_matrix, checking_data_n, chk_error, training_data_n, chk_output, ep_n); //chk_err.c  writes to tracking.txt
			printf("testing 4 \n");	
			chk_rmse_error[ep_n] = chk_error[Out_n];
			for (i=0; i<Out_n; i++)
			//printf("%3d \t %.11f \t %.11f\n", ep_n+1, trn_error[i], chk_error[i]);
			printf("%3d \t %.11f \n", ep_n+1, trn_error[i]);
			//printf("%.11f\t %.11f\n", trn_error[Out_n],chk_error[Out_n]);
			printf("%.11f\t %.11f\n", trn_error[Out_n]);
			write_result(ep_n+1,Out_n,trn_error,chk_error);  //debug.c writes to result.txt
		} 
		else 
		{
			for (i=0; i<Out_n; i++)	
			printf("%4d \t %.11f\n", ep_n+1, trn_error[i]);
		}
***************************/

/**
		//Find minimum training error and its epoch-number
		if(trn_rmse_error[ep_n] < min_trn_RMSE) {
			min_trn_RMSE_epoch = ep_n +1;
			min_trn_RMSE = trn_rmse_error[ep_n];
			record_parameter(parameter_array);
		}

		if(ep_n < epoch_n-1)
		{ 
			//update parameters in 1st layer (Using VNCSA)
			update_parameter(1, step_size); //new_para.c
			//update stepsize
			update_step_size(trn_rmse_error, ep_n, &step_size, decrease_rate, increase_rate); //stepsize.c
		}
	}
***/
////////////////////////////////////////////////////////////

fppp = (FILE *)open_file("status.txt", "w");
fpppp = (FILE *)open_file("trn.txt", "w");


	ep_n=0;

	do
	{
		//step_size_pointer= &step_size;		
		printf("epoch numbernumber %d \n", ep_n+1);	
		//step_size_array[ep_n] = step_size_pointer;
		step_size_array[ep_n] = step_size;
	// after the above step, the updated stepsize at the end of the last loop is stored in the step_size_array.
	// this will keep happening every time we start en epoch and hence at the end of the loop, step_size_array will 
	// have a list of all the updated step sizes. Since this is a offline version, step sizes are updated only
	// at the end of an epoch. 
		for(m = 0; m < Out_n; m++)
		{ 	
			//printf("m loop number %d \n", m);	
			for(j = 0; j < training_data_n; j++)
			{ 
				//printf("j loop number %d \n", j);				
				//copy the input vector(s) to input node(s)
				put_input_data(node_p,j, training_data_matrix); //input.c
	// after this(above) step, the input data is transferred frm the training data matrix to the "node" structure.
				//printf("testing \n");	
				//printf("reeeetesting \n");	
				target[m] = training_data_matrix[j][(m+1)*In_vect_n+m]; // *** 
	// this step assigns the value of the "m"th output of "j" th trainig data pair to target.
				//printf("testing \n");	
				//forward pass, get node outputs from layer 1 to layer 4
				calculate_output(In_n, In_n + In_n*Mf_n + 3*Rule_n, j); //forward.c
	// after this step, output of nodes in layer 1 to 4 is calculated. Please note that when this happens for the first
	// time, i.e. when ep_n=0, our network parametrs are already initialized. thus, it is possible to get the
	// output of each node using the function definitios proposed in forward.c. After first epoch, our parametrs get 
	// updated and this output is then calculated using teh new parameters. The essential point to note here is that
	// we can always calculate the output of each node since we have already initialized our parameters.
				//printf("testing \n");	
				//put outputs of layer 1 to 4 into layer_1_to_4_output
		
				for(k = 0; k < Mf_n*In_n + 3*Rule_n; k++)
				{
				//printf("testing \n");	
				layer_1_to_4_output[j][k] = *node_p[k + In_n]->value;
				//fprintf(fppp, "%lf \t %lf \t \n", (layer_1_to_4_output[j][k]).real, (layer_1_to_4_output[j][k]).imag);
				}
	// the above loop simply puts the values of nodes from layer 1 to layer 4 in the layer_1_to_4_output matrix.

				//identify layer 5 params using LSE (Kalman filter)
				//printf("testing \n");	
				get_kalman_data(kalman_data, target); //kalman.c
	// this function call finds out the values of O4iXnl .. these are basically the coefficients
	// of the kalman parametrs for a given training data pair
	//puts them in kalman_data matrix.
	// this kalman_data matrix has In_n number of rows and number of columns equal to number of parametrs that are
	// responsible for determining each output... as stated above, the outputs are actually the coefficients of the
	// parameters.

				//printf("testing \n");	
				//calculate Kalman parameters
				
				kalman(ep_n, j+(m*training_data_n), m, kalman_data, kalman_parameter,target); //kalman.c
	// this function call evaluates kalman parametrs for a given output, for a given epoch.. that is it takes the epoch 
	// number from us, takes the info about how many times has kalman been invoked before, also takes in the
	// output number(row number) for whihc the parametrs are to be found out... it also takes kalman_data and reads 
	// from it to estimate the kalman parameters... it also takes target .. and stores the output in the mth row of 
	// kalman_parameter.
				//printf("testing \n");	
			}
	// let me tell u what the abopve loop is doing.. after observing closely, it is easy to see that in the above loop, 
	// for a given output node, one by one, all the training data are taken inside the ANCFIS structure, outputs
	// are calculated from one to 4, then a recursive kalman filetr is used to identify the kalman
	// parametrs corresponding to the output node.. these kalman parameters are updated after every tarining data pair 
	// and finally at the end of all the training data, we have an estimate for the kalman parametrs corresponding to 		// the output node.
		}
	// thus, at the of the above loop, the kalman parametrs for all the output nodes are evaluated...

	// now, we are ready to actually calculate the outputs.. plase remember that, all this while, the actual 
	// values of the parametrs of nodes in layer 1 and layer 5 are the ones that were randomly initialized.

		for(j = 0; j < training_data_n; j++)
		{ 
			//printf("testing 1\n");	
			put_input_data(node_p,j, training_data_matrix); //input.c
			//printf("testing 2 \n");	
			for(k = 0; k < Mf_n*In_n + 3*Rule_n; k++)
			{
				*node_p[k + In_n]->value = layer_1_to_4_output[j][k];
				/*if(ep_n==1)
				{
				fprintf(fppp, "%d.\t %lf \t + \t i%lf \n", k, (layer_1_to_4_output[j][k]).real,(layer_1_to_4_output[j][k]).imag);
				}*/
			}
	// u must be able to see that in the above two loops, each time, whatever output we got for a given training 
	// datta pair, it was safely stored in layer_1_to_4 array...and each time, the value on the actual nodes in the
	// structure got changed.. due to new incoming training data pair..this was periodic with period trainingdata_n..
	// that is for each output node, we got the same results for a given training dat aapir.. that is the node values
	// were independent of m. Now, for a given traing data pair, we are getting those value back in the actual node 
	// node structure from that laye blh blah matrix..

			//printf("testing 3\n");	
			put_kalman_parameter(Out_n,kalman_parameter); //kalman.c
			//printf("hihahahha \n");
	// using this function call, we are placing the setimated value of the layer 5 parametrs in the node structure
	// by accessing each node and its parameter list.
			// Do forward pass for L5 and L6
			calculate_output(In_n + In_n*Mf_n + 3*Rule_n, Node_n, j); //forward.c
	// for a given value of the training data pair, this function calculates the output of layer 5 and layer 6 nodes 
	// and places them in the node structure.
			//printf("hihahahha  no 2 \n");
	calculate_root(training_data_matrix,diff,j,node_p); //debug.c
	// this function call calculates the square of the erro between the predicted value of an output node and the 
	// corresponding actual value in the training data matrix..(actual output) and stores it in diff.
	// this call performs the above action for a given training data pair and for all output nodes.

			// Do backward pass and calculate all error rate for each node
			calculate_de_do(j,Out_n,target,ancfis_output); //backward.c
			//printf("hihahahha no 3 \n");
	// calculates de_do for each node fora given training data pair
			update_de_dp(j); //de_dp.c	
	// updates de_do for each node....
		}
	// thus at the end of this loop, estimated outputs for all the training data are calculated..also back propogatin 
	// is done and de_dp for all the nodes is updated.
		
		//printf("testing 1\n");	
		calculate_trn_err(diff,trn_error, trn_datapair_error, training_data_n); //debug.c
		//printf("testing 2 \n");	
		//training_error_measure(target,ancfis_output,diff, training_data_n, trn_error,out_n); //trn_err.c
		trn_rmse_error[ep_n] = trn_error[Out_n];
		trnNMSE[ep_n] = trn_rmse_error[ep_n]*trn_rmse_error[ep_n]/trnvariance;
		fprintf(fppp, "epoch number is %d \t trn RMSE is %.11f \t trn NMSE is  %lf \t \n", ep_n + 1,  trn_rmse_error[ep_n], trnNMSE[ep_n]);
		//fprintf(fpppp, "\n");
		fprintf(fpppp, "epoch number is %d \t trn RMSE is %.11f \t trn NMSE is  %lf \t \n", ep_n + 1,  trn_rmse_error[ep_n], trnNMSE[ep_n]);
		printf("trn RMSE is \t %lf \n", trn_rmse_error[ep_n]);
		printf("trn NMSE is \t %lf \n", trnNMSE[ep_n]);
		for(i=0; i<training_data_n; i++)
		{
		trn_datapair_error_sorted[0][i]=trn_datapair_error[i];
		trn_datapair_error_sorted[1][i]= i+1;
		}

		for(j=1; j<training_data_n; j++)
		{		
		for(i=0; i<training_data_n-j; i++)
		{
		if(trn_datapair_error_sorted[0][i]>trn_datapair_error_sorted[0][i+1])
		{	
		sorting=trn_datapair_error_sorted[0][i+1];
		trn_datapair_error_sorted[0][i+1]=trn_datapair_error_sorted[0][i];
		trn_datapair_error_sorted[0][i]=sorting;
		sortingindex = sorting=trn_datapair_error_sorted[1][i+1];
		trn_datapair_error_sorted[1][i+1]=trn_datapair_error_sorted[1][i];
		trn_datapair_error_sorted[1][i]=sortingindex;
		}
		}
		}

		for(j=0; j<training_data_n; j++)
		{
		fprintf(fppp, "\n");		
		fprintf(fppp, "training data pair sorted number \t %d \n", j+1);
		fprintf(fppp, "training data pair original number \t %d \n", (int)(trn_datapair_error_sorted[1][j]));
		fprintf(fppp, "training data pair sorted error in RMSE is \t %lf \n",trn_datapair_error_sorted[0][j]);
		fprintf(fpppp, "%d \t", (int)(trn_datapair_error_sorted[1][j]));
		complexsum = complex(0.0, 0.0);
		fprintf(fppp,"Normalized layer 3 outputs are as follows \n");
		for(k= In_n*Mf_n + Rule_n; k< In_n*Mf_n + 2*Rule_n; k++)
		{
		fprintf(fppp, "%d.\t %lf + i%lf \t %lf < %lf \n", k, (layer_1_to_4_output[j][k]).real,(layer_1_to_4_output[j][k]).imag, c_abs(layer_1_to_4_output[j][k]), c_phase(layer_1_to_4_output[j][k])*180/PI);
		complexsum = c_add(complexsum, layer_1_to_4_output[j][k]);
		}
		
		
		fprintf(fppp, "Sum of the outputs of layer 3 is \t %lf+i%lf \t %lf<%lf \n", complexsum.real, complexsum.imag, c_abs(complexsum), c_phase(complexsum)*180/PI);
		complexsum = complex(0.0, 0.0);
		fprintf(fppp,"dot producted layer 4 outputs are as follows \n");
		for(k=In_n*Mf_n + 2*Rule_n; k< In_n*Mf_n + 3*Rule_n; k++)
		{
		
		fprintf(fppp, "%d.\t %lf + i%lf \t %lf < %lf \n", k, (layer_1_to_4_output[j][k]).real,(layer_1_to_4_output[j][k]).imag, c_abs(layer_1_to_4_output[j][k]), c_phase(layer_1_to_4_output[j][k])*180/PI);
		complexsum = c_add(complexsum, layer_1_to_4_output[j][k]);
		}

		fprintf(fppp, "sum of the outputs of layer 4 is \t %lf +i%lf \t %lf<%lf \n", complexsum.real, complexsum.imag, c_abs(complexsum), c_phase(complexsum)*180/PI);
		if(j> training_data_n -6 )
		{
		trnnumcheck[(int)(trn_datapair_error_sorted[1][j])]= trnnumcheck[(int)(trn_datapair_error_sorted[1][j])] +1;
		}
		if(j<5 )
		{
		trnnumchecku[(int)(trn_datapair_error_sorted[1][j])]= trnnumchecku[(int)(trn_datapair_error_sorted[1][j])] +1;
		}

		}
		fprintf(fpppp, "\n");
		
		//Find RMSE of testing error
/********************************************************************************
if(checking_data_n != 0) 
		{
			printf("testing 3 \n");	
			epoch_checking_error(checking_data_matrix, checking_data_n, chk_error, training_data_n, chk_output, ep_n); //chk_err.c  writes to tracking.txt
			printf("testing 4 \n");	
			chk_rmse_error[ep_n] = chk_error[Out_n];
			for (i=0; i<Out_n; i++)
			printf("%3d \t %.11f \t %.11f\n", ep_n+1, trn_error[i], chk_error[i]);
			printf("%.11f\t %.11f\n", trn_error[Out_n],chk_error[Out_n]);
			write_result(ep_n+1,Out_n,trn_error,chk_error);  //debug.c writes to result.txt
		} 
		else 
		{
			for (i=0; i<Out_n; i++)	
			printf("%4d \t %.11f\n", ep_n+1, trn_error[i]);
		}
**************************************************************************************/

		// check whether the current training RMSE is less than the threhold and store its epch number and parametrs
		
		if(trn_rmse_error[ep_n] < min_trn_RMSE) 
		{
			min_trn_RMSE_epoch = ep_n +1;
			min_trn_RMSE = trn_rmse_error[ep_n];
			min_trnNMSE = trnNMSE[ep_n];
			record_parameter(parameter_array);
		}

		if(ep_n < epoch_n-1)
		{ 
			//update parameters in 1st layer (Using VNCSA)
			update_parameter(1, step_size); //new_para.c
			//update stepsize
			update_step_size(trn_rmse_error, ep_n, &step_size, decrease_rate, increase_rate); //stepsize.c
		}
		ep_n++;
		
	} while((trnNMSE[ep_n -1]>= threshold) && (ep_n <= epoch_n -1));

for(i=1; i<=training_data_n; i++)
{
	fprintf(fpppp, "%d \t %d \n", i, trnnumcheck[i]);
}
for(i=1; i<=training_data_n; i++)
{
	fprintf(fpppp, "%d \t %d \n", i, trnnumchecku[i]);
}


if(trnNMSE[ep_n -1]< threshold)
{
fprintf(fppp, "\n");
fprintf(fppp, "We have gone below the threshold value \n");
fprintf(fppp, "the epoch number in which this happened is %d \n", min_trn_RMSE_epoch);
}
else
{
fprintf(fppp, "\n");
fprintf(fppp, "We exhausted the available epochs and threshold was not broken :( \n");
fprintf(fppp, "the epoch number which yielded minimum training RMSE is %d \n", min_trn_RMSE_epoch);
}


fclose(fppp);
fclose(fpppp);

double *minmaxc;
minmaxc= (double *)calloc(2*In_n, sizeof(double));
	
	if((fpp = fopen("minmax.txt", "r")) == NULL)
	{
		printf("Cannot open 'parameter_file'.\n");
	}

	for(j = 0; j < 2*In_n; j++)
	{
		if(fscanf(fpp, "%lf", &tmp) != EOF) 
		{
				minmaxc[j] = tmp;
				
		} else 		{
			printf("Not enough data in 'input_parameter'!");
		}
	}

	fclose(fpp);
//////////////////////////////////////////////////////////////


	restore_parameter(parameter_array); //output.c
	write_parameter(FINA_PARA_FILE); //output.c
	write_array(trnNMSE, epoch_n, TRAIN_ERR_FILE); //lib.c
	if (checking_data_n != 0)
	{
		//printf("testing 3 \n");	
		epoch_checking_error(checking_data_matrix, checking_data_n, chk_error_n, chk_error_un, training_data_n, chk_output, ep_n -1, minmaxc); //chk_err.c  writes to tracking.txt
		//printf("testing 4 \n");	
		//chk_rmse_error[ep_n] = chk_error[Out_n];
		min_chk_RMSE_n = chk_error_n[Out_n];
		printf(" initial checking RMSE is %lf \n ", min_chk_RMSE_n);
		min_chk_RMSE_un = chk_error_un[Out_n];
		//for (i=0; i<Out_n; i++)
		//printf("%3d \t %.11f \t %.11f\n", ep_n+1, trn_error[i], chk_error[i]);
		//printf("%3d \t %.11f \n", ep_n+1, trn_error[i]);
			//printf("%.11f\t %.11f\n", trn_error[Out_n],chk_error[Out_n]);
		//printf("%.11f\t \n", trn_error[Out_n]);
		//write_result(min_trn_RMSE_epoch ,Out_n,trn_rmse_error,chk_error);  //debug.c writes to result.txt about the epoch number at which the stopping was done and the corresponding training RMSE and checking RMSE
	} 
	//write_array(chk_rmse_error, epoch_n, CHECK_ERR_FILE); //lib.c
	//}
	
	write_array(step_size_array, epoch_n, STEP_SIZE_FILE); //lib.c

/**************************************************************************
	min_chk_RMSE = chk_rmse_error[epoch_n -1];
	min_chk_RMSE_epoch = epoch_n -1;	
	for(j=0; j< epoch_n; j++)
	{
	if(chk_rmse_error[j]< min_chk_RMSE)
	{
	min_chk_RMSE = chk_rmse_error[j];
	min_chk_RMSE_epoch = j;
	}
	}
*************************************************************************/
/**************************************************************
	double minmaxc[2*In_n];
	
	if((fpp = fopen("minmax.txt", "r")) == NULL)
	{
		printf("Cannot open 'parameter_file'.\n");
	}

	for(j = 0; j < 2*In_n; j++)
	{
		if(fscanf(fpp, "%lf", &tmp) != EOF) 
		{
				minmaxc[j] = tmp;
				
		} else 		{
			printf("Not enough data in 'input_parameter'!");
		}
	}

	fclose(fpp);
***************************************************************************/
	for(k=0; k< Out_n; k++)
	{
	for(j=0; j< checking_data_n; j++)
		{
		 checking_data_matrix_un[j][k]= (checking_data_matrix[j][(k+1)*In_vect_n +k])* (minmaxc[(2*k) +1] - minmaxc[2*k]) + minmaxc[2*k];
		}
	}





// the following code calculates the cdavg_un and checking datat average bothe un normalized
for(k=0; k< Out_n; k++)
	{
	for(j=0; j< checking_data_n; j++)
		{
		checking_data_average_un = checking_data_average_un + checking_data_matrix_un[j][k];
		}
	cdavg_un[k]=checking_data_average_un/checking_data_n;
	checking_data_average_un_temp=checking_data_average_un_temp+checking_data_average_un;
	checking_data_average_un=0;
		}
	
	checking_data_average_un = checking_data_average_un_temp/(Out_n*checking_data_n);
	printf("%lf is the checking datat average non normalized\n", checking_data_average_un);





// the following code calcuates the chkvar_un un normalized 
for(k=0; k< Out_n; k++)
	{	
	for(j=0; j< checking_data_n; j++)
		{				
		temp= temp + (checking_data_matrix_un[j][k] - cdavg_un[k])*(checking_data_matrix_un[j][k] - cdavg_un[k]);
		}
	chkvar_un[k]=temp/(checking_data_n-1);
	//checking_variance_un = checking_variance_un + temp;
	temp=0;
	}




temp =0.0;
// the following code cacluates the un normalized checking varinace
for(j=0; j< checking_data_n; j++)
	{
	for(k=0; k< Out_n; k++)
		{
		temp = checking_data_matrix_un[j][k] - checking_data_average_un;
		temp = temp*temp;
		checking_variance_un = checking_variance_un + temp;
		}
	}
	checking_variance_un = checking_variance_un/((Out_n*checking_data_n)-1);
	printf("%lf is the checking variance non normalized \n", checking_variance_un);

temp =0.0;




checking_data_average_n=0.0;
checking_data_average_n_temp=0.0;
// the following code calculates the cdavg and checking data average both normalized
for(k=0; k< Out_n; k++)
	{	
for(j=0; j< checking_data_n; j++)
		{		
		checking_data_average_n = checking_data_average_n + checking_data_matrix[j][(k+1)*In_vect_n +k];
		}
		cdavg[k]=checking_data_average_n/checking_data_n;
		checking_data_average_n_temp=checking_data_average_n_temp+checking_data_average_n;
		checking_data_average_n=0;
	}
	checking_data_average_n = checking_data_average_n_temp/(Out_n*checking_data_n);
	printf("%lf is the checking datat average  normalized\n", checking_data_average_n);


temp =0.0;
checking_variance_n =0.0;
// the following code cacluates the normalized checking varinace
for(j=0; j< checking_data_n; j++)
	{
	for(k=0; k< Out_n; k++)
		{
		temp = checking_data_matrix[j][(k+1)*In_vect_n +k] - checking_data_average_n;
		temp = temp*temp;
		checking_variance_n = checking_variance_n + temp;
		}
	}
checking_variance_n = checking_variance_n/((Out_n*checking_data_n)-1);
temp = 0.0;
printf("%lf is the checking variance normalized \n", checking_variance_n);



// the following code calcuatres the normalized chkvar[k]
temp=0.0;
for(k=0; k< Out_n; k++)
	{
	for(j=0; j< checking_data_n; j++)
		{	
		temp= temp + (checking_data_matrix[j][(k+1)*In_vect_n +k] - cdavg[k])*(checking_data_matrix[j][(k+1)*In_vect_n +k] - cdavg[k]);
	}
	chkvar[k]=temp/(checking_data_n-1);
	//checking_variance_n = checking_variance_n + temp;
	temp=0;
	}


	
	
	

	NMSE_un = min_chk_RMSE_un * min_chk_RMSE_un / checking_variance_un;
	NMSE_n = min_chk_RMSE_n * min_chk_RMSE_n / checking_variance_n;
	NMSE_n2 = min_chk_RMSE_n * min_chk_RMSE_n / chkvariance;
	NDEI_un = sqrt(NMSE_un);
	NDEI_n = sqrt(NMSE_n);




	for(k=0;k<Out_n;k++)
	{
	NMSE[k]=chk_error_n[k]*chk_error_n[k]/chkvar[k];
	NDEI[k]=sqrt(NMSE[k]);
	unNMSE[k]=chk_error_un[k]*chk_error_un[k]/chkvar_un[k];
	unNDEI[k]=sqrt(unNMSE[k]);
	}






	write_result(min_trn_RMSE_epoch ,Out_n,trn_rmse_error,chk_error_un, chk_error_n, NDEI_un, NMSE_un, NDEI_n, NMSE_n, NMSE, NDEI, unNMSE, unNDEI); //debug.c writes to result.txt about the epoch number at which the stopping was done and the corresponding training RMSE and checking RMSE
	printf("Minimum training RMSE is \t %f \t \n",min_trn_RMSE); 
	printf("Minimum training RMSE epoch is \t %d \n",min_trn_RMSE_epoch); 
	printf("Minimum training NMSE is \t %f \t \n",min_trnNMSE); 
	//printf("Minimum training RMSE epoch is \t %d \n",min_trnNMSE_epoch); 
	//printf("Minimum training RMSE is \t %f \t \n",min_trn_RMSE); 
	//printf("Minimum training RMSE epoch is \t %d \n",min_trn_RMSE_epoch); 
	printf("%f \t is the checking RMSE non normalized\n",min_chk_RMSE_un);
	printf("%f \t is the checking RMSE normalized\n",min_chk_RMSE_n);
	//printf("%f \t is the checking RMSE normalized22222222 \n",min_chk_RMSE_n2);
	printf(" checking NMSE non normlized is %f \t NDEI non normalized is %f \n",NMSE_un, NDEI_un); 
	printf("checking NMSE normalized is %f \t NDEI normalized is %f \n",NMSE_n, NDEI_n); 
	printf("checking NMSE2 normalized is %f \n",NMSE_n2); 
	printf("traning data variance is  %f \n",trnvariance); 
	return(0);
예제 #23
0
/*! \brief
 * <pre>
 * Purpose
 * =======
 *    ilu_cdrop_row() - Drop some small rows from the previous 
 *    supernode (L-part only).
 * </pre>
 */
int ilu_cdrop_row(
	superlu_options_t *options, /* options */
	int    first,	    /* index of the first column in the supernode */
	int    last,	    /* index of the last column in the supernode */
	double drop_tol,    /* dropping parameter */
	int    quota,	    /* maximum nonzero entries allowed */
	int    *nnzLj,	    /* in/out number of nonzeros in L(:, 1:last) */
	double *fill_tol,   /* in/out - on exit, fill_tol=-num_zero_pivots,
			     * does not change if options->ILU_MILU != SMILU1 */
	GlobalLU_t *Glu,    /* modified */
	float swork[],   /* working space
	                     * the length of swork[] should be no less than
			     * the number of rows in the supernode */
	float swork2[], /* working space with the same size as swork[],
			     * used only by the second dropping rule */
	int    lastc	    /* if lastc == 0, there is nothing after the
			     * working supernode [first:last];
			     * if lastc == 1, there is one more column after
			     * the working supernode. */ )
{
    register int i, j, k, m1;
    register int nzlc; /* number of nonzeros in column last+1 */
    register int xlusup_first, xlsub_first;
    int m, n; /* m x n is the size of the supernode */
    int r = 0; /* number of dropped rows */
    register float *temp;
    register complex *lusup = (complex *) Glu->lusup;
    register int *lsub = Glu->lsub;
    register int *xlsub = Glu->xlsub;
    register int *xlusup = Glu->xlusup;
    register float d_max = 0.0, d_min = 1.0;
    int    drop_rule = options->ILU_DropRule;
    milu_t milu = options->ILU_MILU;
    norm_t nrm = options->ILU_Norm;
    complex zero = {0.0, 0.0};
    complex one = {1.0, 0.0};
    complex none = {-1.0, 0.0};
    int i_1 = 1;
    int inc_diag; /* inc_diag = m + 1 */
    int nzp = 0;  /* number of zero pivots */
    float alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim);

    xlusup_first = xlusup[first];
    xlsub_first = xlsub[first];
    m = xlusup[first + 1] - xlusup_first;
    n = last - first + 1;
    m1 = m - 1;
    inc_diag = m + 1;
    nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0;
    temp = swork - n;

    /* Quick return if nothing to do. */
    if (m == 0 || m == n || drop_rule == NODROP)
    {
	*nnzLj += m * n;
	return 0;
    }

    /* basic dropping: ILU(tau) */
    for (i = n; i <= m1; )
    {
	/* the average abs value of ith row */
	switch (nrm)
	{
	    case ONE_NORM:
		temp[i] = scasum_(&n, &lusup[xlusup_first + i], &m) / (double)n;
		break;
	    case TWO_NORM:
		temp[i] = scnrm2_(&n, &lusup[xlusup_first + i], &m)
		    / sqrt((double)n);
		break;
	    case INF_NORM:
	    default:
		k = icamax_(&n, &lusup[xlusup_first + i], &m) - 1;
		temp[i] = c_abs1(&lusup[xlusup_first + i + m * k]);
		break;
	}

	/* drop small entries due to drop_tol */
	if (drop_rule & DROP_BASIC && temp[i] < drop_tol)
	{
	    r++;
	    /* drop the current row and move the last undropped row here */
	    if (r > 1) /* add to last row */
	    {
		/* accumulate the sum (for MILU) */
		switch (milu)
		{
		    case SMILU_1:
		    case SMILU_2:
			caxpy_(&n, &one, &lusup[xlusup_first + i], &m,
				&lusup[xlusup_first + m - 1], &m);
			break;
		    case SMILU_3:
			for (j = 0; j < n; j++)
			    lusup[xlusup_first + (m - 1) + j * m].r +=
				    c_abs1(&lusup[xlusup_first + i + j * m]);
			break;
		    case SILU:
		    default:
			break;
		}
		ccopy_(&n, &lusup[xlusup_first + m1], &m,
                       &lusup[xlusup_first + i], &m);
	    } /* if (r > 1) */
	    else /* move to last row */
	    {
		cswap_(&n, &lusup[xlusup_first + m1], &m,
			&lusup[xlusup_first + i], &m);
		if (milu == SMILU_3)
		    for (j = 0; j < n; j++) {
			lusup[xlusup_first + m1 + j * m].r =
				c_abs1(&lusup[xlusup_first + m1 + j * m]);
			lusup[xlusup_first + m1 + j * m].i = 0.0;
                    }
	    }
	    lsub[xlsub_first + i] = lsub[xlsub_first + m1];
	    m1--;
	    continue;
	} /* if dropping */
	else
	{
	    if (temp[i] > d_max) d_max = temp[i];
	    if (temp[i] < d_min) d_min = temp[i];
	}
	i++;
    } /* for */

    /* Secondary dropping: drop more rows according to the quota. */
    quota = ceil((double)quota / (double)n);
    if (drop_rule & DROP_SECONDARY && m - r > quota)
    {
	register double tol = d_max;

	/* Calculate the second dropping tolerance */
	if (quota > n)
	{
	    if (drop_rule & DROP_INTERP) /* by interpolation */
	    {
		d_max = 1.0 / d_max; d_min = 1.0 / d_min;
		tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r));
	    }
	    else /* by quick select */
	    {
		int len = m1 - n + 1;
		scopy_(&len, swork, &i_1, swork2, &i_1);
		tol = sqselect(len, swork2, quota - n);
#if 0
		register int *itemp = iwork - n;
		A = temp;
		for (i = n; i <= m1; i++) itemp[i] = i;
		qsort(iwork, m1 - n + 1, sizeof(int), _compare_);
		tol = temp[itemp[quota]];
#endif
	    }
	}

	for (i = n; i <= m1; )
	{
	    if (temp[i] <= tol)
	    {
		register int j;
		r++;
		/* drop the current row and move the last undropped row here */
		if (r > 1) /* add to last row */
		{
		    /* accumulate the sum (for MILU) */
		    switch (milu)
		    {
			case SMILU_1:
			case SMILU_2:
			    caxpy_(&n, &one, &lusup[xlusup_first + i], &m,
				    &lusup[xlusup_first + m - 1], &m);
			    break;
			case SMILU_3:
			    for (j = 0; j < n; j++)
				lusup[xlusup_first + (m - 1) + j * m].r +=
   				  c_abs1(&lusup[xlusup_first + i + j * m]);
			    break;
			case SILU:
			default:
			    break;
		    }
		    ccopy_(&n, &lusup[xlusup_first + m1], &m,
			    &lusup[xlusup_first + i], &m);
		} /* if (r > 1) */
		else /* move to last row */
		{
		    cswap_(&n, &lusup[xlusup_first + m1], &m,
			    &lusup[xlusup_first + i], &m);
		    if (milu == SMILU_3)
			for (j = 0; j < n; j++) {
			    lusup[xlusup_first + m1 + j * m].r =
				    c_abs1(&lusup[xlusup_first + m1 + j * m]);
			    lusup[xlusup_first + m1 + j * m].i = 0.0;
                        }
		}
		lsub[xlsub_first + i] = lsub[xlsub_first + m1];
		m1--;
		temp[i] = temp[m1];

		continue;
	    }
	    i++;

	} /* for */

    } /* if secondary dropping */

    for (i = n; i < m; i++) temp[i] = 0.0;

    if (r == 0)
    {
	*nnzLj += m * n;
	return 0;
    }

    /* add dropped entries to the diagnal */
    if (milu != SILU)
    {
	register int j;
	complex t;
	float omega;
	for (j = 0; j < n; j++)
	{
	    t = lusup[xlusup_first + (m - 1) + j * m];
            if (t.r == 0.0 && t.i == 0.0) continue;
            omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / c_abs1(&t), 1.0);
	    cs_mult(&t, &t, omega);

 	    switch (milu)
	    {
		case SMILU_1:
		    if ( !(c_eq(&t, &none)) ) {
                        c_add(&t, &t, &one);
                        cc_mult(&lusup[xlusup_first + j * inc_diag],
			                  &lusup[xlusup_first + j * inc_diag],
                                          &t);
                    }
		    else
		    {
                        cs_mult(
                                &lusup[xlusup_first + j * inc_diag],
			        &lusup[xlusup_first + j * inc_diag],
                                *fill_tol);
#ifdef DEBUG
			printf("[1] ZERO PIVOT: FILL col %d.\n", first + j);
			fflush(stdout);
#endif
			nzp++;
		    }
		    break;
		case SMILU_2:
                    cs_mult(&lusup[xlusup_first + j * inc_diag],
                                          &lusup[xlusup_first + j * inc_diag],
                                          1.0 + c_abs1(&t));
		    break;
		case SMILU_3:
                    c_add(&t, &t, &one);
                    cc_mult(&lusup[xlusup_first + j * inc_diag],
	                              &lusup[xlusup_first + j * inc_diag],
                                      &t);
		    break;
		case SILU:
		default:
		    break;
	    }
	}
	if (nzp > 0) *fill_tol = -nzp;
    }

    /* Remove dropped entries from the memory and fix the pointers. */
    m1 = m - r;
    for (j = 1; j < n; j++)
    {
	register int tmp1, tmp2;
	tmp1 = xlusup_first + j * m1;
	tmp2 = xlusup_first + j * m;
	for (i = 0; i < m1; i++)
	    lusup[i + tmp1] = lusup[i + tmp2];
    }
    for (i = 0; i < nzlc; i++)
	lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m];
    for (i = 0; i < nzlc; i++)
	lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i];
    for (i = first + 1; i <= last + 1; i++)
    {
	xlusup[i] -= r * (i - first);
	xlsub[i] -= r;
    }
    if (lastc)
    {
	xlusup[last + 2] -= r * n;
	xlsub[last + 2] -= r;
    }

    *nnzLj += (m - r) * n;
    return r;
}
예제 #24
0
void schurFactorization(long n, complex **A, complex **T, complex **U)
{

  /* Schur factorization: A = U*T*U', T = upper triangular, U = unitary */
            
  long i,j,iter,maxIter;
  double tol, diff1,diff2; 
  complex T11, T12, T21, T22; 
  complex sigma1, sigma2, sigma; 
  complex z, z1, z2; 
  complex **P, **Q, **R;


  /* Allocate auxiliary matrices */

  P     = (complex **)Mem(MEM_ALLOC,n, sizeof(complex *)); 
  Q     = (complex **)Mem(MEM_ALLOC,n, sizeof(complex *)); 
  R     = (complex **)Mem(MEM_ALLOC,n, sizeof(complex *)); 

  for (i=0; i<n; i++){
    P[i]     = (complex *)Mem(MEM_ALLOC,n, sizeof(complex)); 
    Q[i]     = (complex *)Mem(MEM_ALLOC,n, sizeof(complex)); 
    R[i]     = (complex *)Mem(MEM_ALLOC,n, sizeof(complex)); 
  }

  /* ------------------------------------------------------------*/

  /* Parameters for iteration */

   maxIter = 500;
   tol     = 1E-30; 

  /* ------------------------------------------------------------*/

  /* Init U = eye(n) (identity matrix) */

  for (i=0; i<n; i++){
    U[i][i].re = 1.0; 
    U[i][i].im = 0.0; 
  }  
  
  /* ------------------------------------------------------------*/

  /* Reduce A to Hessenberg form */

  hessFactorization(n,A,P,T); 

  /* ------------------------------------------------------------*/

  /* Compute Schur factorization of Hessenberg matrix T */


   for (j=n-1; j>0; j--){ /* Main loop */

     for (iter=0; iter<maxIter; iter++){ /* Iteration loop */

       sigma.re = T[j][j].re;
       sigma.im = T[j][j].im; 


       /* -- Use Wilkinson shift -- */

       /* submatrix considered in the shift */

       T11 = T[j-1][j-1];
       T12 = T[j-1][j];
       T21 = T[j][j-1];
       T22 = T[j][j];

       /* Compute eigenvalues of submatrix */

       z.re  = 0.0;
       z.im  = 0.0;
       z2.re = 0.0;
       z2.im = 0.0;

       /* z = T11*T11 + T22*T22 - 2*T11*T22 + 4*T12*T21 */

       z1 = c_mul(T11,T11);

       z  = c_add(z ,z1);
       z2 = c_add(z2,z1);
       
       z1 = c_mul(T22,T22);

       z  = c_add(z ,z1);
       z2 = c_add(z2,z1);

       z1 = c_mul(T11,T22);

       z1.re = -2.0 * z1.re;
       z1.im = -2.0 * z1.im;

       z  = c_add(z,z1);

       z1 = c_mul(T12,T21);
       z1.re = 4.0 * z1.re;
       z1.im = 4.0 * z1.im;
       z = c_add(z,z1);

       /* Square root*/

       z = c_sqrt(z);

       /* Eigenvalues */
       
       sigma1 = c_add(z2,z);
       sigma2 = c_sub(z2,z);

/*        printf("sigma1 = %e %e\n", sigma1.re, sigma1.im); */
/*        printf("sigma2 = %e %e\n", sigma2.re, sigma2.im); */

       /* Select eigenvalue for shift*/

       diff1 = c_norm( c_sub(T[j][j], sigma1) );
       diff2 = c_norm( c_sub(T[j][j], sigma2) );

       if (diff1 < diff2){
	 sigma.re = sigma1.re;
	 sigma.im = sigma1.im;
       }else{
	 sigma.re = sigma2.re;
	 sigma.im = sigma2.im;
       }

       /* --- QR step with Wilkinson shift --- */

       /* Shift: T(1:j,1:j) = T(1:j,1:j) - sigma * eye(j) */

       for (i=0; i<j+1; i++){

	 CheckValue(FUNCTION_NAME, "T[i][i].re","", T[i][i].re, -INFTY, INFTY);
	 CheckValue(FUNCTION_NAME, "T[i][i].im","", T[i][i].im, -INFTY, INFTY);	 

	 T[i][i].re = T[i][i].re - sigma.re;   
	 T[i][i].im = T[i][i].im - sigma.im;   
	 
       }

       /* Compute QR factorization of shifted Hessenberg matrix */

       for (i=0; i<n; i++){
	 memset(Q[i], 0, n*sizeof(complex));
	 memset(R[i], 0, n*sizeof(complex));
       }

       QRfactorization(n,T,Q,R); 

       /* T = T_new = R * Q  */

       for (i=0; i<n; i++){
	 memset(T[i], 0, n*sizeof(complex));
       }
       matProduct(n, n, n, R, Q, T);

       /* T(1:j,1:j) = T(1:j,1:j) + sigma * eye(j) */
       for (i=0; i<j+1; i++){
	 T[i][i].re = T[i][i].re + sigma.re;   
	 T[i][i].im = T[i][i].im + sigma.im;   
       }


       /* R =  U_new = U * Q */

       for (i=0; i<n; i++){
	 memset(R[i], 0, n*sizeof(complex));
       }       
       matProduct(n,n,n,U,Q,R); 

       /* U = R */

       for (i=0; i<n; i++){
	 memcpy(U[i],R[i], n*sizeof(complex));
       } 

       /* Check convergence */

       if (c_norm( T[j][j-1] ) <= tol * (c_norm(T[j-1][j-1]) + c_norm(T[j][j]))){
	 T[j][j-1].re = 0.0;
	 T[j][j-1].im = 0.0;
	 break; 
       }
       
   
     }	/* end of iter loop */  
    
   } /* end of main loop */


  /* -------------------------------------------------------------*/

   /* U = P*U */

   for (i=0; i<n; i++){
     memset(U[i], 0, n*sizeof(complex));
   }
   matProduct(n,n,n,P,R,U);
   

  /* -------------------------------------------------------------*/
  /* Free auxiliary variables */

   for (i=0; i<n; i++){
     Mem(MEM_FREE,P[i]); 
     Mem(MEM_FREE,Q[i]); 
     Mem(MEM_FREE,R[i]); 
   }

   Mem(MEM_FREE,P); 
   Mem(MEM_FREE,Q); 
   Mem(MEM_FREE,R); 

  /* Return */

  return;   
    
  
}
예제 #25
0
void QRfactorization(long n, complex **A, complex **Q, complex **R)
{

  /* QR factorization based on Householder transformations */
            
  long i,j,k,m;
  double nrm; 
  complex z,z1,z2;  
  complex *vj;

 /* Init. Q = eye(n) (identity matrix) */

  for (i=0; i<n; i++){
    Q[i][i].re = 1.0; 
    Q[i][i].im = 0.0; 
  }


  /* Init. R = A  */

  for(j=0; j<n; j++){
    for (i=0; i<n; i++){
      R[i][j].re = A[i][j].re;
      R[i][j].im = A[i][j].im;

      CheckValue(FUNCTION_NAME, "A[i][j].re","", A[i][j].re, -INFTY, INFTY);
      CheckValue(FUNCTION_NAME, "A[i][j].im","", A[i][j].im, -INFTY, INFTY);

    }
  }

  /* Allocate auxiliary variables*/

  vj = (complex *)Mem(MEM_ALLOC, n, sizeof(complex));

 /*  printf("begin calc, n = %d\n", n);  */

    /* ------------------------------------------------------------*/

  for (j=0; j<n; j++){ /* Main loop */

    /* R(j:end, j)  */

    for (i=j; i<n; i++){
      vj[i-j].re = R[i][j].re;
      vj[i-j].im = R[i][j].im;
    }
    
    nrm = vectorNorm(n-j, vj); 

    /* v(1) = v(1) + sign(R(j,j)) * norm(v) */

    vj[0].re = vj[0].re + R[j][j].re / c_norm(R[j][j]) * nrm; 
    vj[0].im = vj[0].im + R[j][j].im / c_norm(R[j][j]) * nrm;


    /* Update norm */

    nrm = vectorNorm(n-j, vj);  

    /* v = v./norm(v) */

    for (i=0; i<n-j; i++){
      vj[i].re = vj[i].re / nrm; 
      vj[i].im = vj[i].im / nrm; 
    }

    /* Update */

    /* R(j:end, :) = R(j:end,:) - 2 * vj * vj' * R(j:end,:), : */

    /* Q(:,j:end)  = Q(:,j:end) - 2 * Q(:,j:end) * vj * vj^T */

    for (k=0; k<n; k++){

	/* (v * v' * A)_ik = v_i SUM_m Conj(v_m) A_mk */

	z.re = 0.0; 
	z.im = 0.0; 

	for (m=j; m<n; m++){

	  z1 = c_con(vj[m-j]); 
	  z1 = c_mul(z1, R[m][k]); 	  
	  z  = c_add(z,z1); 
	}

	for (i=j; i<n; i++){ 

	  z2    = c_mul(vj[i-j],z);

	  /* Update R(i,k) */
	  
	  R[i][k].re = R[i][k].re - 2.0 * z2.re; 
	  R[i][k].im = R[i][k].im - 2.0 * z2.im; 

	  CheckValue(FUNCTION_NAME, "R[i][k].re","", R[i][k].re, -INFTY, INFTY);
	  CheckValue(FUNCTION_NAME, "R[i][k].im","", R[i][k].im, -INFTY, INFTY);

	} 

	/* (A * v * v^')_ki = v_i * SUM_m Conj(v_m) A_km */

	z.re = 0.0;
	z.im = 0.0;

	for (m=j; m<n; m++){

	  z1 = vj[m-j]; 
	  z1 = c_mul(z1, Q[k][m]);
	  z  = c_add(z,z1);
	}

	for (i=j; i<n; i++){

	  z1 = c_con(vj[i-j]); 
	  z2 = c_mul(z1,z);

	  /* Update Q(k,i)*/

	  Q[k][i].re = Q[k][i].re - 2.0 * z2.re; 
	  Q[k][i].im = Q[k][i].im - 2.0 * z2.im; 

	  CheckValue(FUNCTION_NAME, "Q[k][i].re","", Q[k][i].re, -INFTY, INFTY);
	  CheckValue(FUNCTION_NAME, "Q[k][i].im","", Q[k][i].im, -INFTY, INFTY);
	}	      
    }
  } /* End of main loop (j) */

  /* -------------------------------------------------------------*/

  /* Free auxiliary variables */

  Mem(MEM_FREE, vj); 
  return;   

    
  
}
예제 #26
0
int
ilu_ccopy_to_ucol(
	      int	 jcol,	   /* in */
	      int	 nseg,	   /* in */
	      int	 *segrep,  /* in */
	      int	 *repfnz,  /* in */
	      int	 *perm_r,  /* in */
	      complex	 *dense,   /* modified - reset to zero on return */
	      int  	 drop_rule,/* in */
	      milu_t	 milu,	   /* in */
	      double	 drop_tol, /* in */
	      int	 quota,    /* maximum nonzero entries allowed */
	      complex	 *sum,	   /* out - the sum of dropped entries */
	      int	 *nnzUj,   /* in - out */
	      GlobalLU_t *Glu,	   /* modified */
	      float	 *work	   /* working space with minimum size n,
				    * used by the second dropping rule */
	      )
{
/*
 * Gather from SPA dense[*] to global ucol[*].
 */
    int       ksub, krep, ksupno;
    int       i, k, kfnz, segsze;
    int       fsupc, isub, irow;
    int       jsupno, nextu;
    int       new_next, mem_error;
    int       *xsup, *supno;
    int       *lsub, *xlsub;
    complex    *ucol;
    int       *usub, *xusub;
    int       nzumax;
    int       m; /* number of entries in the nonzero U-segments */
    register float d_max = 0.0, d_min = 1.0 / dlamch_("Safe minimum");
    register double tmp;
    complex zero = {0.0, 0.0};
    int i_1 = 1;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    ucol    = Glu->ucol;
    usub    = Glu->usub;
    xusub   = Glu->xusub;
    nzumax  = Glu->nzumax;

    *sum = zero;
    if (drop_rule == NODROP) {
	drop_tol = -1.0, quota = Glu->n;
    }

    jsupno = supno[jcol];
    nextu  = xusub[jcol];
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {
	krep = segrep[k--];
	ksupno = supno[krep];

	if ( ksupno != jsupno ) { /* Should go into ucol[] */
	    kfnz = repfnz[krep];
	    if ( kfnz != EMPTY ) {	/* Nonzero U-segment */

		fsupc = xsup[ksupno];
		isub = xlsub[fsupc] + kfnz - fsupc;
		segsze = krep - kfnz + 1;

		new_next = nextu + segsze;
		while ( new_next > nzumax ) {
		    if ((mem_error = cLUMemXpand(jcol, nextu, UCOL, &nzumax,
			    Glu)) != 0)
			return (mem_error);
		    ucol = Glu->ucol;
		    if ((mem_error = cLUMemXpand(jcol, nextu, USUB, &nzumax,
			    Glu)) != 0)
			return (mem_error);
		    usub = Glu->usub;
		    lsub = Glu->lsub;
		}

		for (i = 0; i < segsze; i++) {
		    irow = lsub[isub++];
         	    tmp = c_abs1(&dense[irow]);

		    /* first dropping rule */
		    if (quota > 0 && tmp >= drop_tol) {
			if (tmp > d_max) d_max = tmp;
			if (tmp < d_min) d_min = tmp;
			usub[nextu] = perm_r[irow];
			ucol[nextu] = dense[irow];
			nextu++;
		    } else {
			switch (milu) {
			    case SMILU_1:
			    case SMILU_2:
                                c_add(sum, sum, &dense[irow]);
				break;
			    case SMILU_3:
				/* *sum += fabs(dense[irow]);*/
				sum->r += tmp;
				break;
			    case SILU:
			    default:
				break;
			}
#ifdef DEBUG
			num_drop_U++;
#endif
		    }
		    dense[irow] = zero;
		}

	    }

	}

    } /* for each segment... */

    xusub[jcol + 1] = nextu;	  /* Close U[*,jcol] */
    m = xusub[jcol + 1] - xusub[jcol];

    /* second dropping rule */
    if (drop_rule & DROP_SECONDARY && m > quota) {
	register double tol = d_max;
	register int m0 = xusub[jcol] + m - 1;

	if (quota > 0) {
	    if (drop_rule & DROP_INTERP) {
		d_max = 1.0 / d_max; d_min = 1.0 / d_min;
		tol = 1.0 / (d_max + (d_min - d_max) * quota / m);
	    } else {
                i_1 = xusub[jcol];
                for (i = 0; i < m; ++i, ++i_1) work[i] = c_abs1(&ucol[i_1]);
		tol = sqselect(m, work, quota);
#if 0
		A = &ucol[xusub[jcol]];
		for (i = 0; i < m; i++) work[i] = i;
		qsort(work, m, sizeof(int), _compare_);
		tol = fabs(usub[xusub[jcol] + work[quota]]);
#endif
	    }
	}
	for (i = xusub[jcol]; i <= m0; ) {
	    if (c_abs1(&ucol[i]) <= tol) {
		switch (milu) {
		    case SMILU_1:
		    case SMILU_2:
			c_add(sum, sum, &ucol[i]);
			break;
		    case SMILU_3:
			sum->r += tmp;
			break;
		    case SILU:
		    default:
			break;
		}
		ucol[i] = ucol[m0];
		usub[i] = usub[m0];
		m0--;
		m--;
#ifdef DEBUG
		num_drop_U++;
#endif
		xusub[jcol + 1]--;
		continue;
	    }
	    i++;
	}
    }

    if (milu == SMILU_2) {
        sum->r = c_abs1(sum); sum->i = 0.0;
    }
    if (milu == SMILU_3) sum->i = 0.0;

    *nnzUj += m;

    return 0;
}
예제 #27
0
파일: csp_blas2.c 프로젝트: BranYang/scipy
/*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y
 *
 * <pre>  
 *   Purpose   
 *   =======   
 *
 *   sp_cgemv()  performs one of the matrix-vector operations   
 *      y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
 *   where alpha and beta are scalars, x and y are vectors and A is a
 *   sparse A->nrow by A->ncol matrix.   
 *
 *   Parameters   
 *   ==========   
 *
 *   TRANS  - (input) char*
 *            On entry, TRANS specifies the operation to be performed as   
 *            follows:   
 *               TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
 *               TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
 *               TRANS = 'C' or 'c'   y := alpha*A^H*x + beta*y.   
 *
 *   ALPHA  - (input) complex
 *            On entry, ALPHA specifies the scalar alpha.   
 *
 *   A      - (input) SuperMatrix*
 *            Before entry, the leading m by n part of the array A must   
 *            contain the matrix of coefficients.   
 *
 *   X      - (input) complex*, array of DIMENSION at least   
 *            ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
 *           and at least   
 *            ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
 *            Before entry, the incremented array X must contain the   
 *            vector x.   
 * 
 *   INCX   - (input) int
 *            On entry, INCX specifies the increment for the elements of   
 *            X. INCX must not be zero.   
 *
 *   BETA   - (input) complex
 *            On entry, BETA specifies the scalar beta. When BETA is   
 *            supplied as zero then Y need not be set on input.   
 *
 *   Y      - (output) complex*,  array of DIMENSION at least   
 *            ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
 *            and at least   
 *            ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
 *            Before entry with BETA non-zero, the incremented array Y   
 *            must contain the vector y. On exit, Y is overwritten by the 
 *            updated vector y.
 *	      
 *   INCY   - (input) int
 *            On entry, INCY specifies the increment for the elements of   
 *            Y. INCY must not be zero.   
 *
 *    ==== Sparse Level 2 Blas routine.   
 * </pre>
*/
int
sp_cgemv(char *trans, complex alpha, SuperMatrix *A, complex *x, 
	 int incx, complex beta, complex *y, int incy)
{

    /* Local variables */
    NCformat *Astore;
    complex   *Aval;
    int info;
    complex temp, temp1;
    int lenx, leny, i, j, irow;
    int iy, jx, jy, kx, ky;
    int notran;
    complex comp_zero = {0.0, 0.0};
    complex comp_one = {1.0, 0.0};

    notran = ( strncmp(trans, "N", 1)==0 || strncmp(trans, "n", 1)==0 );
    Astore = A->Store;
    Aval = Astore->nzval;
    
    /* Test the input parameters */
    info = 0;
    if ( !notran && strncmp(trans, "T", 1)!=0 && strncmp(trans, "C", 1)!=0)
        info = 1;
    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
    else if (incx == 0) info = 5;
    else if (incy == 0)	info = 8;
    if (info != 0) {
	input_error("sp_cgemv ", &info);
	return 0;
    }

    /* Quick return if possible. */
    if (A->nrow == 0 || A->ncol == 0 || 
	c_eq(&alpha, &comp_zero) && 
	c_eq(&beta, &comp_one))
	return 0;

    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
       up the start points in  X  and  Y. */
    if ( notran ) {
	lenx = A->ncol;
	leny = A->nrow;
    } else {
	lenx = A->nrow;
	leny = A->ncol;
    }
    if (incx > 0) kx = 0;
    else kx =  - (lenx - 1) * incx;
    if (incy > 0) ky = 0;
    else ky =  - (leny - 1) * incy;

    /* Start the operations. In this version the elements of A are   
       accessed sequentially with one pass through A. */
    /* First form  y := beta*y. */
    if ( !c_eq(&beta, &comp_one) ) {
	if (incy == 1) {
	    if ( c_eq(&beta, &comp_zero) )
		for (i = 0; i < leny; ++i) y[i] = comp_zero;
	    else
		for (i = 0; i < leny; ++i) 
		  cc_mult(&y[i], &beta, &y[i]);
	} else {
	    iy = ky;
	    if ( c_eq(&beta, &comp_zero) )
		for (i = 0; i < leny; ++i) {
		    y[iy] = comp_zero;
		    iy += incy;
		}
	    else
		for (i = 0; i < leny; ++i) {
		    cc_mult(&y[iy], &beta, &y[iy]);
		    iy += incy;
		}
	}
    }
    
    if ( c_eq(&alpha, &comp_zero) ) return 0;

    if ( notran ) {
	/* Form  y := alpha*A*x + y. */
	jx = kx;
	if (incy == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		if ( !c_eq(&x[jx], &comp_zero) ) {
		    cc_mult(&temp, &alpha, &x[jx]);
		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
			irow = Astore->rowind[i];
			cc_mult(&temp1, &temp,  &Aval[i]);
			c_add(&y[irow], &y[irow], &temp1);
		    }
		}
		jx += incx;
	    }
	} else {
	    ABORT("Not implemented.");
	}
    } else if (strncmp(trans, "T", 1) == 0 || strncmp(trans, "t", 1) == 0) {
	/* Form  y := alpha*A'*x + y. */
	jy = ky;
	if (incx == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		temp = comp_zero;
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    irow = Astore->rowind[i];
		    cc_mult(&temp1, &Aval[i], &x[irow]);
		    c_add(&temp, &temp, &temp1);
		}
		cc_mult(&temp1, &alpha, &temp);
		c_add(&y[jy], &y[jy], &temp1);
		jy += incy;
	    }
	} else {
	    ABORT("Not implemented.");
	}
    } else { /* trans == 'C' or 'c' */
	/* Form  y := alpha * conj(A) * x + y. */
	complex temp2;
	jy = ky;
	if (incx == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		temp = comp_zero;
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    irow = Astore->rowind[i];
		    temp2.r = Aval[i].r;
		    temp2.i = -Aval[i].i;  /* conjugation */
		    cc_mult(&temp1, &temp2, &x[irow]);
		    c_add(&temp, &temp, &temp1);
		}
		cc_mult(&temp1, &alpha, &temp);
		c_add(&y[jy], &y[jy], &temp1);
		jy += incy;
	    }
	} else {
	    ABORT("Not implemented.");
	}
    }

    return 0;    
} /* sp_cgemv */
예제 #28
0
int
pcgstrf_column_bmod(
		    const int  pnum,   /* process number */
		    const int  jcol,   /* current column in the panel */
		    const int  fpanelc,/* first column in the panel */
		    const int  nseg,   /* number of s-nodes to update jcol */
		    int        *segrep,/* in */
		    int        *repfnz,/* in */
		    complex     *dense, /* modified */
		    complex     *tempv, /* working array */
		    pxgstrf_shared_t *pxgstrf_shared, /* modified */
		    Gstat_t *Gstat     /* modified */
		    )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j].
 *
 * Return value:
 * =============
 *      0 - successful return
 *    > 0 - number of bytes allocated when run out of space
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    
#ifdef USE_VENDOR_BLAS    
    int         incx = 1, incy = 1;
    complex      alpha, beta;
#endif
    GlobalLU_t *Glu = pxgstrf_shared->Glu;   /* modified */
    
    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    complex	  ukj, ukj1, ukj2;
    register int lptr, kfnz, isub, irow, i, no_zeros;
    register int luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
    int          jsupno, k, ksub, krep, krep_ind, ksupno;
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
			     panel and the first column of the current snode.*/
    int          *xsup, *supno;
    int          *lsub, *xlsub, *xlsub_end;
    complex       *lusup;
    int          *xlusup, *xlusup_end;
    complex       *tempv1;
    int          mem_error;
    register float flopcnt;

    complex      zero = {0.0, 0.0};
    complex      one = {1.0, 0.0};
    complex      none = {-1.0, 0.0};
    complex      comp_temp, comp_temp1;

    xsup       = Glu->xsup;
    supno      = Glu->supno;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;
    jsupno     = supno[jcol];

    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

	krep = segrep[k];
	k--;
	ksupno = supno[krep];
#if ( DEBUGlvel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n",
       pnum, jcol, nseg, krep, jsupno, ksupno);
#endif    
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

	    fsupc = xsup[ksupno];
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

  	    /* Distance from the current supernode to the current panel; 
	       d_fsupc=0 if fsupc >= fpanelc. */
  	    d_fsupc = fst_col - fsupc; 

	    luptr = xlusup[fst_col] + d_fsupc;
	    lptr = xlsub[fsupc] + d_fsupc;
	    kfnz = repfnz[krep];
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
	    segsze = krep - kfnz + 1;
	    nsupc = krep - fst_col + 1;
	    nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	    nrow = nsupr - d_fsupc - nsupc;
	    krep_ind = lptr + nsupc - 1;

	flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;//sj
		Gstat->procstat[pnum].fcops += flopcnt;

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)	    
printf("(%d) pcgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\
fsupc %d, nsupr %d, nsupc %d\n",
       pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc);

#endif



            /*
             * Case 1: Update U-segment of size 1 -- col-col update
             */
            if ( segsze == 1 ) {
                ukj = dense[lsub[krep_ind]];
                luptr += nsupr*(nsupc-1) + nsupc;

                for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                    irow = lsub[i];
                    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                    c_sub(&dense[irow], &dense[irow], &comp_temp);
                    luptr++;
                }

            } else if ( segsze <= 3 ) {
                ukj = dense[lsub[krep_ind]];
                luptr += nsupr*(nsupc-1) + nsupc-1;
                ukj1 = dense[lsub[krep_ind - 1]];
                luptr1 = luptr - nsupr;

                if ( segsze == 2 ) { /* Case 2: 2cols-col update */
                    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    c_sub(&ukj, &ukj, &comp_temp);
                    dense[lsub[krep_ind]] = ukj;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                } else { /* Case 3: 3cols-col update */
                    ukj2 = dense[lsub[krep_ind - 2]];
                    luptr2 = luptr1 - nsupr;
                    cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                    c_sub(&ukj1, &ukj1, &comp_temp);

                    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                    c_add(&comp_temp, &comp_temp, &comp_temp1);
                    c_sub(&ukj, &ukj, &comp_temp);

                    dense[lsub[krep_ind]] = ukj;
                    dense[lsub[krep_ind-1]] = ukj1;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        luptr2++;
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                }


	    } else {
	  	/*
		 * Case: sup-col update
		 * Perform a triangular solve and block update,
		 * then scatter the result of sup-col update to dense
		 */
		no_zeros = kfnz - fst_col;

	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
	        isub = lptr + no_zeros;
	        for (i = 0; i < segsze; i++) {
	  	    irow = lsub[isub];
		    tempv[i] = dense[irow];
		    ++isub; 
	        }

	        /* Dense triangular solve -- start effective triangle */
		luptr += nsupr * no_zeros + no_zeros; 
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#else
		ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#endif
		
 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		alpha = one;
		beta = zero;
#if ( MACH==CRAY_PVP )
		CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		clsolve ( nsupr, segsze, &lusup[luptr], tempv );

 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif
                /* Scatter tempv[] into SPA dense[*] */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i]; /* Scatter */
                    tempv[i] = zero;
                    isub++;
                }

		/* Scatter tempv1[] into SPA dense[*] */
		for (i = 0; i < nrow; i++) {
		    irow = lsub[isub];
                    c_sub(&dense[irow], &dense[irow], &tempv1[i]);
		    tempv1[i] = zero;
		    ++isub;
		}
	    } /* else segsze >= 4 */
	    
	} /* if jsupno ... */

    } /* for each segment... */

    
    /* ------------------------------------------
       Process the supernodal portion of L\U[*,j]
       ------------------------------------------ */
    
    fsupc = SUPER_FSUPC (jsupno);
    nsupr = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, 
			       pxgstrf_shared)) )
	return mem_error;
    xlusup[jcol] = nextlu;
    lusup = Glu->lusup;
    
    /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
	dense[irow] = zero;
#ifdef DEBUG
if (jcol == -1)
    printf("(%d) pcgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n",
	   pnum, jcol, irow, lusup[nextlu]);
#endif	
	++nextlu;
    }
    xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */

#if ( DEBUGlevel>=2 )
if (jcol == -1) {
    nrow = xlusup_end[jcol] - xlusup[jcol];
    print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]],
		     &lusup[xlusup[jcol]]);
}
#endif    
    
    /*
     * For more updates within the panel (also within the current supernode), 
     * should start from the first column of the panel, or the first column 
     * of the supernode, whichever is bigger. There are 2 cases:
     *    (1) fsupc < fpanelc,  then fst_col := fpanelc
     *    (2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

  	/* distance between the current supernode and the current panel;
	   d_fsupc=0 if fsupc >= fpanelc. */
  	d_fsupc = fst_col - fsupc;

	lptr = xlsub[fsupc] + d_fsupc;
	luptr = xlusup[fst_col] + d_fsupc;
	nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	nsupc = jcol - fst_col;	/* Excluding jcol */
	nrow = nsupr - d_fsupc - nsupc;

	/* points to the beginning of jcol in supernode L\U[*,jsupno] */
	ufirst = xlusup[jcol] + d_fsupc;	

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n",
       pnum, jcol, fsupc, nsupr, nsupc, nrow);
#endif    

	flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;    */
	
#ifdef USE_VENDOR_BLAS
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
#if ( MACH==CRAY_PVP )
	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

	cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
		 &lusup[ufirst], tempv );
	
        /* Copy updates from tempv[*] into lusup[*] */
	isub = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
            c_sub(&lusup[isub], &lusup[isub], &tempv[i]);
            tempv[i] = zero;
	    ++isub;
	}
#endif
    } /* if fst_col < jcol ... */ 

    return 0;
}
예제 #29
0
void
cpanel_bmod (
            const int  m,          /* in - number of rows in the matrix */
            const int  w,          /* in */
            const int  jcol,       /* in */
            const int  nseg,       /* in */
            complex     *dense,     /* out, of size n by w */
            complex     *tempv,     /* working array */
            int        *segrep,    /* in */
            int        *repfnz,    /* in, of size n by w */
            GlobalLU_t *Glu,       /* modified */
            SuperLUStat_t *stat    /* output */
            )
{


#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int          incx = 1, incy = 1;
    complex       alpha, beta;
#endif

    register int k, ksub;
    int          fsupc, nsupc, nsupr, nrow;
    int          krep, krep_ind;
    complex       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    int          block_nrow;  /* no of rows in a block row */
    register int lptr;        /* Points to the row subscripts of a supernode */
    int          kfnz, irow, no_zeros;
    register int isub, isub1, i;
    register int jj;          /* Index through each column in the panel */
    int          *xsup, *supno;
    int          *lsub, *xlsub;
    complex       *lusup;
    int          *xlusup;
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    complex       *dense_col;  /* dense[] for a column in the panel */
    complex       *tempv1;             /* Used in 1-D update */
    complex       *TriTmp, *MatvecTmp; /* used in 2-D update */
    complex      zero = {0.0, 0.0};
    complex      one = {1.0, 0.0};
    complex      comp_temp, comp_temp1;
    register int ldaTmp;
    register int r_ind, r_hi;
    static   int first = 1, maxsuper, rowblk, colblk;
    flops_t  *ops = stat->ops;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;

    if ( first ) {
        maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) );
        rowblk   = sp_ienv(4);
        colblk   = sp_ienv(5);
        first = 0;
    }
    ldaTmp = maxsuper + rowblk;

    /*
     * For each nonz supernode segment of U[*,j] in topological order
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */

        /* krep = representative of current k-th supernode
         * fsupc = first supernodal column
         * nsupc = no of columns in a supernode
         * nsupr = no of rows in a supernode
         */
        krep = segrep[k--];
        fsupc = xsup[supno[krep]];
        nsupc = krep - fsupc + 1;
        nsupr = xlsub[fsupc+1] - xlsub[fsupc];
        nrow = nsupr - nsupc;
        lptr = xlsub[fsupc];
        krep_ind = lptr + nsupc - 1;

        repfnz_col = repfnz;
        dense_col = dense;

        if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */

            TriTmp = tempv;

            /* Sequence through each column in panel -- triangular solves */
            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {

                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                luptr = xlusup[fsupc];

                ops[TRSV] += 4 * segsze * (segsze - 1);
                ops[GEMV] += 8 * nrow * segsze;

                /* Case 1: Update U-segment of size 1 -- col-col update */
                if ( segsze == 1 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc;

                    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
                        irow = lsub[i];
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        ++luptr;
                    }

                } else if ( segsze <= 3 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    ukj1 = dense_col[lsub[krep_ind - 1]];
                    luptr += nsupr*(nsupc-1) + nsupc-1;
                    luptr1 = luptr - nsupr;

                    if ( segsze == 2 ) {
                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            luptr++; luptr1++;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    } else {
                        ukj2 = dense_col[lsub[krep_ind - 2]];
                        luptr2 = luptr1 - nsupr;
                        cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                        c_sub(&ukj1, &ukj1, &comp_temp);

                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        dense_col[lsub[krep_ind-1]] = ukj1;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            luptr++; luptr1++; luptr2++;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    }

                } else  {       /* segsze >= 4 */

                    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
                       holds the result of triangular solves.    */
                    no_zeros = kfnz - fsupc;
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; ++i) {
                        irow = lsub[isub];
                        TriTmp[i] = dense_col[irow]; /* Gather */
                        ++isub;
                    }

                    /* start effective triangle */
                    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                           &nsupr, TriTmp, &incx );
#else
                    ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                           &nsupr, TriTmp, &incx );
#endif
#else
                    clsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
#endif


                } /* else ... */

            }  /* for jj ... end tri-solves */

            /* Block row updates; push all the way into dense[*] block */
            for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {

                r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
                block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
                luptr = xlusup[fsupc] + nsupc + r_ind;
                isub1 = lptr + nsupc + r_ind;

                repfnz_col = repfnz;
                TriTmp = tempv;
                dense_col = dense;

                /* Sequence through each column in panel -- matrix-vector */
                for (jj = jcol; jj < jcol + w; jj++,
                     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {

                    kfnz = repfnz_col[krep];
                    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */

                    segsze = krep - kfnz + 1;
                    if ( segsze <= 3 ) continue;   /* skip unrolled cases */

                    /* Perform a block update, and scatter the result of
                       matrix-vector to dense[].                 */
                    no_zeros = kfnz - fsupc;
                    luptr1 = luptr + nsupr * no_zeros;
                    MatvecTmp = &TriTmp[maxsuper];

#ifdef USE_VENDOR_BLAS
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
                    CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#else
                    cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#endif
#else
                    cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
                           TriTmp, MatvecTmp);
#endif

                    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
                     * such that MatvecTmp[*] can be re-used for the
                     * the next blok row update. dense[] will be copied into
                     * global store after the whole panel has been finished.
                     */
                    isub = isub1;
                    for (i = 0; i < block_nrow; i++) {
                        irow = lsub[isub];
                        c_sub(&dense_col[irow], &dense_col[irow],
                              &MatvecTmp[i]);
                        MatvecTmp[i] = zero;
                        ++isub;
                    }

                } /* for jj ... */

            } /* for each block row ... */

            /* Scatter the triangular solves into SPA dense[*] */
            repfnz_col = repfnz;
            TriTmp = tempv;
            dense_col = dense;

            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue; /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                if ( segsze <= 3 ) continue; /* skip unrolled cases */

                no_zeros = kfnz - fsupc;
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense_col[irow] = TriTmp[i];
                    TriTmp[i] = zero;
                    ++isub;
                }

            } /* for jj ... */

        } else { /* 1-D block modification */


            /* Sequence through each column in the panel */
            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m) {

                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                luptr = xlusup[fsupc];

                ops[TRSV] += 4 * segsze * (segsze - 1);
                ops[GEMV] += 8 * nrow * segsze;

                /* Case 1: Update U-segment of size 1 -- col-col update */
                if ( segsze == 1 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc;

                    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
                        irow = lsub[i];
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        ++luptr;
                    }

                } else if ( segsze <= 3 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc-1;
                    ukj1 = dense_col[lsub[krep_ind - 1]];
                    luptr1 = luptr - nsupr;

                    if ( segsze == 2 ) {
                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            ++luptr;  ++luptr1;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    } else {
                        ukj2 = dense_col[lsub[krep_ind - 2]];
                        luptr2 = luptr1 - nsupr;
                        cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                        c_sub(&ukj1, &ukj1, &comp_temp);

                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        dense_col[lsub[krep_ind-1]] = ukj1;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            ++luptr; ++luptr1; ++luptr2;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    }

                } else  { /* segsze >= 4 */
                    /*
                     * Perform a triangular solve and block update,
                     * then scatter the result of sup-col update to dense[].
                     */
                    no_zeros = kfnz - fsupc;

                    /* Copy U[*,j] segment from dense[*] to tempv[*]:
                     *    The result of triangular solve is in tempv[*];
                     *    The result of matrix vector update is in dense_col[*]
                     */
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; ++i) {
                        irow = lsub[isub];
                        tempv[i] = dense_col[irow]; /* Gather */
                        ++isub;
                    }

                    /* start effective triangle */
                    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                           &nsupr, tempv, &incx );
#else
                    ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                           &nsupr, tempv, &incx );
#endif

                    luptr += segsze;    /* Dense matrix-vector */
                    tempv1 = &tempv[segsze];
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
                    CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
                    cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
                    clsolve ( nsupr, segsze, &lusup[luptr], tempv );

                    luptr += segsze;        /* Dense matrix-vector */
                    tempv1 = &tempv[segsze];
                    cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
#endif

                    /* Scatter tempv[*] into SPA dense[*] temporarily, such
                     * that tempv[*] can be used for the triangular solve of
                     * the next column of the panel. They will be copied into
                     * ucol[*] after the whole panel has been finished.
                     */
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; i++) {
                        irow = lsub[isub];
                        dense_col[irow] = tempv[i];
                        tempv[i] = zero;
                        isub++;
                    }

                    /* Scatter the update from tempv1[*] into SPA dense[*] */
                    /* Start dense rectangular L */
                    for (i = 0; i < nrow; i++) {
                        irow = lsub[isub];
                        c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
                        tempv1[i] = zero;
                        ++isub;
                    }

                } /* else segsze>=4 ... */

            } /* for each column in the panel... */

        } /* else 1-D update ... */

    } /* for each updating supernode ... */

}
예제 #30
0
int
ilu_cpivotL(
	const int  jcol,     /* in */
	const double u,      /* in - diagonal pivoting threshold */
	int	   *usepr,   /* re-use the pivot sequence given by
			      * perm_r/iperm_r */
	int	   *perm_r,  /* may be modified */
	int	   diagind,  /* diagonal of Pc*A*Pc' */
	int	   *swap,    /* in/out record the row permutation */
	int	   *iswap,   /* in/out inverse of swap, it is the same as
				perm_r after the factorization */
	int	   *marker,  /* in */
	int	   *pivrow,  /* in/out, as an input if *usepr!=0 */
	double	   fill_tol, /* in - fill tolerance of current column
			      * used for a singular column */
	milu_t	   milu,     /* in */
	complex	   drop_sum, /* in - computed in ilu_ccopy_to_ucol()
                                (MILU only) */
	GlobalLU_t *Glu,     /* modified - global LU data structures */
	SuperLUStat_t *stat  /* output */
       )
{

    int		 n;	 /* number of columns */
    int		 fsupc;  /* first column in the supernode */
    int		 nsupc;  /* no of columns in the supernode */
    int		 nsupr;  /* no of rows in the supernode */
    int		 lptr;	 /* points to the starting subscript of the supernode */
    register int	 pivptr;
    int		 old_pivptr, diag, ptr0;
    register float  pivmax, rtemp;
    float	 thresh;
    complex	 temp;
    complex	 *lu_sup_ptr;
    complex	 *lu_col_ptr;
    int		 *lsub_ptr;
    register int	 isub, icol, k, itemp;
    int		 *lsub, *xlsub;
    complex	 *lusup;
    int		 *xlusup;
    flops_t	 *ops = stat->ops;
    int		 info;
    complex one = {1.0, 0.0};

    /* Initialize pointers */
    n	       = Glu->n;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    fsupc      = (Glu->xsup)[(Glu->supno)[jcol]];
    nsupc      = jcol - fsupc;		/* excluding jcol; nsupc >= 0 */
    lptr       = xlsub[fsupc];
    nsupr      = xlsub[fsupc+1] - lptr;
    lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */
    lu_col_ptr = &lusup[xlusup[jcol]];	/* start of jcol in the supernode */
    lsub_ptr   = &lsub[lptr];	/* start of row indices of the supernode */

    /* Determine the largest abs numerical value for partial pivoting;
       Also search for user-specified pivot, and diagonal element. */
    pivmax = -1.0;
    pivptr = nsupc;
    diag = EMPTY;
    old_pivptr = nsupc;
    ptr0 = EMPTY;
    for (isub = nsupc; isub < nsupr; ++isub) {
        if (marker[lsub_ptr[isub]] > jcol)
            continue; /* do not overlap with a later relaxed supernode */

	switch (milu) {
	    case SMILU_1:
                c_add(&temp, &lu_col_ptr[isub], &drop_sum);
		rtemp = c_abs1(&temp);
		break;
	    case SMILU_2:
	    case SMILU_3:
                /* In this case, drop_sum contains the sum of the abs. value */
		rtemp = c_abs1(&lu_col_ptr[isub]);
		break;
	    case SILU:
	    default:
		rtemp = c_abs1(&lu_col_ptr[isub]);
		break;
	}
	if (rtemp > pivmax) { pivmax = rtemp; pivptr = isub; }
	if (*usepr && lsub_ptr[isub] == *pivrow) old_pivptr = isub;
	if (lsub_ptr[isub] == diagind) diag = isub;
	if (ptr0 == EMPTY) ptr0 = isub;
    }

    if (milu == SMILU_2 || milu == SMILU_3) pivmax += drop_sum.r;

    /* Test for singularity */
    if (pivmax < 0.0) {
	fprintf(stderr, "[0]: jcol=%d, SINGULAR!!!\n", jcol);
	fflush(stderr);
	exit(1);
    }
    if ( pivmax == 0.0 ) {
	if (diag != EMPTY)
	    *pivrow = lsub_ptr[pivptr = diag];
	else if (ptr0 != EMPTY)
	    *pivrow = lsub_ptr[pivptr = ptr0];
	else {
	    /* look for the first row which does not
	       belong to any later supernodes */
	    for (icol = jcol; icol < n; icol++)
		if (marker[swap[icol]] <= jcol) break;
	    if (icol >= n) {
		fprintf(stderr, "[1]: jcol=%d, SINGULAR!!!\n", jcol);
		fflush(stderr);
		exit(1);
	    }

	    *pivrow = swap[icol];

	    /* pick up the pivot row */
	    for (isub = nsupc; isub < nsupr; ++isub)
		if ( lsub_ptr[isub] == *pivrow ) { pivptr = isub; break; }
	}
	pivmax = fill_tol;
	lu_col_ptr[pivptr].r = pivmax;
	lu_col_ptr[pivptr].i = 0.0;
	*usepr = 0;
#ifdef DEBUG
	printf("[0] ZERO PIVOT: FILL (%d, %d).\n", *pivrow, jcol);
	fflush(stdout);
#endif
	info =jcol + 1;
    } /* if (*pivrow == 0.0) */
    else {
	thresh = u * pivmax;

	/* Choose appropriate pivotal element by our policy. */
	if ( *usepr ) {
	    switch (milu) {
		case SMILU_1:
                    c_add(&temp, &lu_col_ptr[old_pivptr], &drop_sum);
		    rtemp = c_abs1(&temp);
		    break;
		case SMILU_2:
		case SMILU_3:
		    rtemp = c_abs1(&lu_col_ptr[old_pivptr]) + drop_sum.r;
		    break;
		case SILU:
		default:
		    rtemp = c_abs1(&lu_col_ptr[old_pivptr]);
		    break;
	    }
	    if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr;
	    else *usepr = 0;
	}
	if ( *usepr == 0 ) {
	    /* Use diagonal pivot? */
	    if ( diag >= 0 ) { /* diagonal exists */
		switch (milu) {
		    case SMILU_1:
                        c_add(&temp, &lu_col_ptr[diag], &drop_sum);
         	        rtemp = c_abs1(&temp);
			break;
		    case SMILU_2:
		    case SMILU_3:
			rtemp = c_abs1(&lu_col_ptr[diag]) + drop_sum.r;
			break;
		    case SILU:
		    default:
			rtemp = c_abs1(&lu_col_ptr[diag]);
			break;
		}
		if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag;
	    }
	    *pivrow = lsub_ptr[pivptr];
	}
	info = 0;

	/* Reset the diagonal */
	switch (milu) {
	    case SMILU_1:
		c_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum);
		break;
	    case SMILU_2:
	    case SMILU_3:
                temp = c_sgn(&lu_col_ptr[pivptr]);
                cc_mult(&temp, &temp, &drop_sum);
                c_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum);
		break;
	    case SILU:
	    default:
		break;
	}

    } /* else */

    /* Record pivot row */
    perm_r[*pivrow] = jcol;
    if (jcol < n - 1) {
	register int t1, t2, t;
	t1 = iswap[*pivrow]; t2 = jcol;
	if (t1 != t2) {
	    t = swap[t1]; swap[t1] = swap[t2]; swap[t2] = t;
	    t1 = swap[t1]; t2 = t;
	    t = iswap[t1]; iswap[t1] = iswap[t2]; iswap[t2] = t;
	}
    } /* if (jcol < n - 1) */

    /* Interchange row subscripts */
    if ( pivptr != nsupc ) {
	itemp = lsub_ptr[pivptr];
	lsub_ptr[pivptr] = lsub_ptr[nsupc];
	lsub_ptr[nsupc] = itemp;

	/* Interchange numerical values as well, for the whole snode, such 
	 * that L is indexed the same way as A.
	 */
	for (icol = 0; icol <= nsupc; icol++) {
	    itemp = pivptr + icol * nsupr;
	    temp = lu_sup_ptr[itemp];
	    lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr];
	    lu_sup_ptr[nsupc + icol*nsupr] = temp;
	}
    } /* if */

    /* cdiv operation */
    ops[FACT] += 10 * (nsupr - nsupc);
    c_div(&temp, &one, &lu_col_ptr[nsupc]);
    for (k = nsupc+1; k < nsupr; k++) 
	cc_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp);

    return info;
}