Пример #1
0
// Find the optimal rigid superposition of the pointset `from` onto `onto`.
//
// The transform is reported as three steps (translation, rotation,
// translation) to be applied to the points of `from`:
//   t1  : minus the centroid of `from`
//   rot : 3x3 rotation matrix built from a quaternion (an eigenvector of Q)
//   t2  : the centroid of `onto`
// t1, rot and t2 are grown to 3 / 3x3 entries when the caller passes smaller
// containers; larger containers are left at their size and only the leading
// 3 / 3x3 entries are written.
//
// Points are paired by index and only coordinates 0..2 of each point are
// read. Both pointsets are left unchanged.
//
// Returns the rmsd of the superposition, sqrt(eigenvalue / number of points),
// or 0 when that eigenvalue comes out negative.
//
// Terminates the program with exit(1) when the pointsets differ in size, when
// they are empty (the centroid would be 0/0), or when Jacobi() reports
// failure.
float findSuperpositionTransform( vector<vector<float> > & from, vector<vector<float> > & onto, vector<float> & t1, vector<vector<float> > & rot, vector<float> & t2 ) {
    if(from.size() != onto.size()) { cout << "superpose called with unequal pointsets" << endl; exit(1); }
    // an empty pointset has no centroid; fail loudly instead of returning NaNs
    if(from.empty()) { cout << "superpose called with empty pointsets" << endl; exit(1); }
    const size_t npoints = from.size();

    // make sure the outputs can be indexed below; never shrink caller storage
    if(t1.size() < 3) t1.resize(3);
    if(t2.size() < 3) t2.resize(3);
    if(rot.size() < 3) rot.resize(3);
    for(int i=0; i < 3; i++)
        if(rot[i].size() < 3) rot[i].resize(3);

    // find center of mass of each pointset
    vector<float> CMfrom(3, 0), CMonto(3, 0);
    for(size_t pi=0; pi < npoints; pi++)
        for(int i=0; i < 3; i++) {
            CMfrom[i] += from[pi][i];
            CMonto[i] += onto[pi][i];
        }
    for(int i=0; i < 3; i++) { CMfrom[i] /= npoints; CMonto[i] /= npoints; }

    // accumulate the symmetric 4x4 matrix Q from the differences (xm,ym,zm)
    // and sums (xp,yp,zp) of the centered coordinates; only the upper
    // triangle is accumulated, the lower triangle is mirrored afterwards
    vector<vector<float> > Q(4, vector<float>(4, 0));
    float xm, ym, zm, xp, yp, zp; // minus and plus
    for(size_t pi=0; pi < npoints; pi++) {
        xm = from[pi][0]-CMfrom[0] - onto[pi][0]+CMonto[0]; xp = onto[pi][0]-CMonto[0] + from[pi][0]-CMfrom[0];
        ym = from[pi][1]-CMfrom[1] - onto[pi][1]+CMonto[1]; yp = onto[pi][1]-CMonto[1] + from[pi][1]-CMfrom[1];
        zm = from[pi][2]-CMfrom[2] - onto[pi][2]+CMonto[2]; zp = onto[pi][2]-CMonto[2] + from[pi][2]-CMfrom[2];
        Q[0][0] += xm*xm + ym*ym + zm*zm;
        Q[1][1] += yp*yp + zp*zp + xm*xm;
        Q[2][2] += xp*xp + zp*zp + ym*ym;
        Q[3][3] += xp*xp + yp*yp + zm*zm;
        Q[0][1] += yp*zm - ym*zp;
        Q[0][2] += xm*zp - xp*zm;
        Q[0][3] += xp*ym - xm*yp;
        Q[1][2] += xm*ym - xp*yp;
        Q[1][3] += xm*zm - xp*zp;
        Q[2][3] += ym*zm - yp*zp;
    }
    for(int r=1; r < 4; r++)
        for(int c=0; c < r; c++) Q[r][c] = Q[c][r];

    // eigen-decomposition of Q; a non-zero return from Jacobi is treated as failure
    vector<float> eigvals(4,0);
    vector<vector<float> > eigvecs(4,eigvals);
    if(Jacobi(Q, eigvals, eigvecs)) { cout << "Jacobi did not converge" << endl; exit(1); }

    // NOTE(review): index 3 is taken to be the lowest eigenvalue, i.e. Jacobi is
    // assumed to return eigenvalues in descending order - confirm against Jacobi().
    float rmsd;
    if(eigvals[3] < 0) rmsd = 0; // guards sqrt against a slightly negative eigenvalue
    else rmsd = sqrt(eigvals[3]/npoints);

    // construct rotation matrix from eigenvector with lowest eigenvalue
    // (the eigenvector is read as column 3 of eigvecs)
    float q1 = eigvecs[0][3], q2 = eigvecs[1][3], q3 = eigvecs[2][3], q4 = eigvecs[3][3];
    float q1sqr=q1*q1, q2sqr=q2*q2, q3sqr=q3*q3, q4sqr=q4*q4;
    float q1q2=q1*q2, q1q3=q1*q3, q1q4=q1*q4;
    float q2q3=q2*q3, q2q4=q2*q4, q3q4=q3*q4;

    rot[0][0] = q1sqr + q2sqr - q3sqr -q4sqr;
    rot[0][1] = 2.0 * (q2q3 - q1q4);
    rot[0][2] = 2.0 * (q2q4 + q1q3);

    rot[1][0] = 2.0 * (q2q3 + q1q4);
    rot[1][1] = q1sqr + q3sqr - q2sqr - q4sqr;
    rot[1][2] = 2.0 * (q3q4 - q1q2);

    rot[2][0] = 2.0 * (q2q4 - q1q3);
    rot[2][1] = 2.0 * (q3q4 + q1q2);
    rot[2][2] = q1sqr + q4sqr -q2sqr - q3sqr;

    // first translation moves the centroid of `from` to the origin,
    // second translation moves the origin to the centroid of `onto`
    for(int i=0; i < 3; i++) t1[i] = -1 * CMfrom[i];
    for(int i=0; i < 3; i++) t2[i] = CMonto[i];

    return rmsd;
}
Пример #2
0
/*
 * smartrel driver.
 *
 * Outline of what this function does, in order:
 *   1. read the run configuration (readcommands) and open the output streams;
 *   2. load SNPs, individuals and genotypes, then flag SNPs / individuals
 *      that must be ignored (chromosome filters, position window, no data,
 *      optional high-r2 pruning);
 *   3. build the nrows x nrows matrix XTX from the per-SNP columns, one block
 *      of `blocksize` SNPs at a time, scale it by trace/(nrows-1) and run
 *      eigvecs() on it, repeating after each round of outlier removal;
 *   4. pass the leading `numeigs` scaled eigenvectors to subouter() and, for
 *      every pair of samples (i<j), print the value
 *      XTX[i][j]/sqrt(XTX[i][i]*XTX[j][j]) + 1/(nrows-1) when it is at least
 *      `relthresh`, in the order produced by sortit().
 *
 * Most configuration (parname, numeigs, relthresh, ldregress, ...) lives in
 * variables declared outside this function.
 * Returns 0 on every normal exit path; fatal conditions go through fatalx().
 */
int main(int argc, char **argv)
{

  char **eglist ;
  int numeg ;
  int i, j, k, pos; 
  int *vv ;
  SNP *cupt, *cupt2 ;
  Indiv *indx ;
  double y1, y2, y ;

  int n0, n1, nkill ;

  int nindiv = 0 ;
  int nignore, numrisks = 1 ;
  SNP **xsnplist  ;
  Indiv **xindlist ;
  int *xindex ;
  int nrows, ncols, m ;
  double *XTX, *cc, *evecs, *ww ;
  double *lambda ;
  double *tvecs ;
  int weightmode = NO ;
  int t ;
  double *xmean, *xfancy ;
  double *ldmat = NULL, *ldmat2 = NULL;
  double *ldvv = NULL, *ldvv2 = NULL, *vv2 = NULL ;
  int chrom,  numclear ;
  double gdis ;
  int outliter, numoutiter, *badlist, nbad ;
  int a, b, n ;
  FILE *outlfile ;
  

  // SNP columns are accumulated into tblock and multiplied out in chunks of
  // at most blocksize columns
  int xblock, blocksize=10000 ;   
  double *tblock ;  

  OUTLINFO *outpt ;
  int *idperm, *vecind ;   // for sort

  readcommands(argc, argv) ;
  printf("## smartrel version: %s\n", WVERSION) ;
  packmode = YES ;
  setomode(&outputmode, omode) ;

  // nothing to do when parname was not set
  if (parname == NULL) return 0 ;
  // when xchrom selects chromosome numchrom+1, clear the flag that would
  // otherwise cause SNPs on that chromosome to be ignored below
  if (xchrom == (numchrom+1)) noxdata = NO ;

  // fstonly: no eigenvectors, no outlier iterations, no output files
  if (fstonly) { 
   printf("fstonly\n") ;
   numeigs = 0 ; 
   numoutliter = 0 ;
   numoutiter = 0 ;
   outputname = NULL ;
   snpeigname = NULL ;
  }

  if (fancynorm) printf("norm used\n\n") ;
  else printf("no norm used\n\n") ;

  nostatslim = MAX(nostatslim, 3) ;

  // both streams default to stdout and are redirected only when a name is given
  outlfile = ofile = stdout; 

  if (outputname != NULL)  openit(outputname, &ofile, "w") ;
  if (outliername != NULL) openit(outliername, &outlfile, "w") ;
  if (fstdetailsname != NULL) openit(fstdetailsname, &fstdetails, "w") ;

  // load SNP table, individual table and genotypes
  numsnps = 
    getsnps(snpname, &snpmarkers, 0.0, badsnpname, &nignore, numrisks) ;

  numindivs = getindivs(indivname, &indivmarkers) ;
  // NOTE(review): the value stored in k here is never read
  k = getgenos(genotypename, snpmarkers, indivmarkers, 
    numsnps, numindivs, nignore) ;


  // population list: taken from poplistname when given, otherwise built from
  // the individuals that were loaded
  if (poplistname != NULL) 
  { 
    ZALLOC(eglist, numindivs, char *) ; 
    numeg = loadlist(eglist, poplistname) ;
    seteglist(indivmarkers, numindivs, poplistname);
  }
  else
  {
    setstatus(indivmarkers, numindivs, NULL) ;
    ZALLOC(eglist, MAXPOPS, char *) ;
    numeg = makeeglist(eglist, MAXPOPS, indivmarkers, numindivs) ;
  }
  // loop body is only a disabled debug print
  for (i=0; i<numeg; i++) 
  {  
    /* printf("%3d %s\n",i, eglist[i]) ; */
  }

  // count individuals whose affstatus is YES
  // (nindiv is not read again in this function)
  nindiv=0 ;
  for (i=0; i<numindivs; i++) 
  {
    indx = indivmarkers[i] ;
    if(indx -> affstatus == YES) ++nindiv  ;
  }

  // chromosome filter: ignore chromosome 0, anything above numchrom+1, and
  // chromosome numchrom+1 itself when noxdata is set
  for (i=0; i<numsnps; i++)  
  {  
    cupt = snpmarkers[i] ; 
    chrom = cupt -> chrom ;
    if ((noxdata) && (chrom == (numchrom+1))) cupt-> ignore = YES ;
    if (chrom == 0) cupt -> ignore = YES ;
    if (chrom > (numchrom+1)) cupt -> ignore = YES ;
  }
  // optional single-chromosome window [lopos, hipos] (active when xchrom > 0),
  // then drop SNPs for which numvalidgtx() reports at most one valid genotype
  for (i=0; i<numsnps; i++)  
  {
    cupt = snpmarkers[i] ; 
    pos = nnint(cupt -> physpos) ;
    if ((xchrom>0) && (cupt -> chrom != xchrom)) cupt -> ignore = YES ;
    if ((xchrom > 0) && (pos < lopos)) cupt -> ignore = YES ;
    if ((xchrom > 0) && (pos > hipos)) cupt -> ignore = YES ;
    if (cupt -> ignore) continue ;
    if (numvalidgtx(indivmarkers, cupt, YES) <= 1) 
    { 
      printf("nodata: %20s\n", cupt -> ID) ;
      cupt -> ignore = YES ;
    }
  }

  // optional pruning step controlled by killr2 and the r2* thresholds
  if (killr2) {
   nkill = killhir2(snpmarkers, numsnps, numindivs, r2physlim, r2genlim, r2thresh) ;
   if (nkill>0) printf("killhir2.  number of snps killed: %d\n", nkill) ;
  }

  // ignore every individual whose entry in vv is 0
  // (presumably vv[i] is the number of valid genotypes of individual i - confirm
  // against numvalidgtallind)
  ZALLOC(vv, numindivs, int) ;
  numvalidgtallind(vv, snpmarkers, numsnps,  numindivs) ; 
  for (i=0; i<numindivs; ++i)  { 
  if (vv[i] == 0) {
    indx = indivmarkers[i] ;
    indx -> ignore = YES ; 
   }
  }
  free(vv) ;

  numsnps = rmsnps(snpmarkers, numsnps, NULL) ;  //  rid ignorable snps

   
  // missingmode also switches fancynorm off
  if (missingmode) 
  {
    setmiss(snpmarkers, numsnps) ;
    fancynorm = NO ;
  }

  if  (weightname != NULL)   
  {  
    weightmode = YES ;
    getweights(weightname, snpmarkers, numsnps) ;
  }
  // work buffers for the LD regression on the previous ldregress SNPs;
  // ldmat starts as setidmat() output scaled by 1.0e-6 via vst()
  if (ldregress>0) 
  {  
    ZALLOC(ldvv,  ldregress*numindivs, double) ;
    ZALLOC(ldvv2,  ldregress*numindivs, double) ;
    ZALLOC(vv2,  numindivs, double) ;
    ZALLOC(ldmat,  ldregress*ldregress, double) ;
    ZALLOC(ldmat2,  ldregress*ldregress, double) ;
    setidmat(ldmat, ldregress) ;         
    vst(ldmat, ldmat, 1.0e-6, ldregress*ldregress) ;
  }

  ZALLOC(xindex, numindivs, int) ;
  ZALLOC(xindlist, numindivs, Indiv *) ;
  ZALLOC(xsnplist, numsnps, SNP *) ;

  if (popsizelimit > 0) 
  {  
    setplimit(indivmarkers, numindivs, eglist, numeg, popsizelimit) ; 
  }

  // nrows = number of samples used, ncols = number of SNPs used (see printf)
  nrows = loadindx(xindlist, xindex, indivmarkers, numindivs) ;
  ncols = loadsnpx(xsnplist, snpmarkers, numsnps, indivmarkers) ;
  printf("number of samples used: %d number of snps used: %d\n", nrows, ncols) ;

/**
  cupt = xsnplist[0] ;
  for (j=0; j<nrows; ++j) {  
   k = xindex[j] ;
   g = getgtypes(cupt, k) ;
   indx = indivmarkers[k] ;
   t = indxindex(eglist, numeg, indx -> egroup) ;
   printf("yy1 %20s %20s %20s %d %d %d\n", cupt ->ID, indx -> ID, indx -> egroup, j, k, g) ;
  }
  printf("yya: ") ; printimat(xindex, 1, nrows) ;
  printf("zzindxa:  %s\n", indivmarkers[230] -> egroup) ;
*/

  /* printf("## nrows: %d  ncols  %d\n", nrows, ncols) ; */
  // all matrices are sized for the initial nrows; nrows can only be
  // recomputed (after outlier removal) further down, the buffers are not resized
  ZALLOC(xmean, ncols, double) ;
  ZALLOC(xfancy, ncols, double) ;
  ZALLOC(XTX, nrows*nrows, double) ;
  ZALLOC(evecs, nrows*nrows, double) ;
  ZALLOC(tvecs, nrows*nrows, double) ;
  ZALLOC(lambda, nrows, double) ;
  ZALLOC(cc, nrows, double) ;
  ZALLOC(ww, nrows, double) ;
  ZALLOC(badlist, nrows, int) ;

  blocksize = MIN(blocksize, ncols) ; 
  ZALLOC(tblock, nrows*blocksize, double) ;

  // xfancy is multiplier for column xmean is mean to take off
  // badlist is list of rows to delete (outlier removal) 

  // one pass by default; numoutliter outlier passes plus a final pass otherwise
  numoutiter = 1 ;  

  if (numoutliter>=1) 
  {
    numoutiter = numoutliter+1 ;
    ZALLOC(outinfo, nrows,  OUTLINFO *) ;  
    for (k=0; k<nrows; k++) 
    {  
      ZALLOC(outinfo[k], 1, OUTLINFO) ;
    }
    /* fprintf(outlfile, "##%18s %4s %6s %9s\n", "ID", "iter","eigvec", "score") ; */
  }

  // main loop: build XTX, decompose it, remove outliers, repeat
  for (outliter = 1; outliter <= numoutiter ; ++outliter)  {
    // fstonly skips the computation: XTX from setidmat(), lambda filled with 1.0
    if (fstonly) { 
     setidmat(XTX, nrows) ;
     vclear(lambda, 1.0, nrows) ;
     break ;
    }
    // SNPs may have been flagged ignore during the previous pass
    if (outliter>1) {
     ncols = loadsnpx(xsnplist, snpmarkers, numsnps, indivmarkers) ;
    }
    vzero(XTX, nrows*nrows) ;
    vzero(tblock, nrows*blocksize) ;
    xblock = 0 ; 

    vzero(xmean, ncols) ;
    vclear(xfancy, 1.0, ncols) ;

    for (i=0; i<ncols; i++) 
    { 
      cupt = xsnplist[i] ;
      chrom = cupt -> chrom ;
      // cc receives the column for SNP i; n0/n1 are filled in by getcolxz
      // (presumably the two allele counts - confirm against getcolxz)
      getcolxz(cc, cupt, xindex, nrows, i, xmean, xfancy, &n0, &n1) ;
      t = MIN(n0, n1) ; 

      // SNPs at or below minallelecnt contribute a zero column and are
      // flagged so that later passes skip them
      if (t <= minallelecnt)  {  
       cupt -> ignore = YES ;
       vzero(cc, nrows) ; 
      }

      if (weightmode) 
      {
        vst(cc, cc, xsnplist[i] -> weight, nrows) ;
      }
      if (ldregress>0) 
      {  
        // walk back over the previous ldregress SNPs; the first one that does
        // not exist, is on another chromosome, or is at genetic distance
        // >= ldlimit determines how many LD slots are cleared (numclear)
        numclear = 0 ;
        for (k=1; k<= ldregress; ++k)  
        {  
          j = i-k ;  
          if (j<0) 
          { 
            numclear = ldregress-k+1 ; 
            break ;
          }
          cupt2 = xsnplist[j] ;  
          if (cupt2 -> chrom != chrom) gdis = ldlimit + 1.0 ; 
          else gdis = cupt -> genpos - cupt2 -> genpos ;
          if (gdis>=ldlimit) 
          {   
            numclear = ldregress-k+1 ; 
            break ;
          }
        }
        if (numclear>0) clearld(ldmat, ldvv, ldregress, nrows, numclear) ; 
        // ldreg writes its results into the *2 buffers and vv2, which are then
        // copied back over ldmat, cc and ldvv
        ldreg(ldmat, ldmat2, cc, vv2, ldvv, ldvv2, ldregress, nrows) ;
        copyarr(ldmat2, ldmat, ldregress*ldregress) ;
        copyarr(vv2, cc, nrows) ;
        copyarr(ldvv2, ldvv, ldregress*nrows) ;
      }
      // append the column to the current block
      copyarr(cc, tblock+xblock*nrows, nrows) ;
      ++xblock ; 

/** this is the key code to parallelize */
      // block is full: multiply it out into tvecs and accumulate into XTX
      if (xblock==blocksize) 
      {  
        domult(tvecs, tblock, xblock, nrows) ;
        vvp(XTX, XTX, tvecs, nrows*nrows) ;
        xblock = 0 ;
        vzero(tblock, nrows*blocksize) ;
      }
    }

    // flush the last, partially filled block
    if (xblock>0) 
    { 
     domult(tvecs, tblock, xblock, nrows) ;
     vvp(XTX, XTX, tvecs, nrows*nrows) ;
    }
    symit(XTX, nrows) ;

    /**
    a = 0; b=0 ;
    printf("zz1 %12.6f ", XTX[a*nrows+b]) ;
    a = nrows-1; b=nrows-1 ;
    printf(" %12.6f %15.9g\n", XTX[a*nrows+b], asum(XTX, nrows*nrows)) ;
    */

    if (verbose) 
    {
      printdiag(XTX, nrows) ;
    }

    // scale XTX by trace/(nrows-1); a NaN or non-positive value is fatal
    y = trace(XTX, nrows) / (double) (nrows-1) ;
    if (isnan(y)) fatalx("bad XTX matrix\n") ;
    /* printf("trace:  %9.3f\n", y) ; */
    if (y<=0.0) fatalx("XTX has zero trace (perhaps no data)\n") ;
    vst(XTX, XTX, 1.0/y, nrows * nrows) ;
/// mean eigenvalue is 1
    eigvecs(XTX, lambda, evecs, nrows) ;
// eigenvalues are in decreasing order 

    if (outliter > numoutliter) break ;  
    // last pass skips outliers 
    numoutleigs = MIN(numoutleigs, nrows-1) ;
    nbad = ridoutlier(evecs, nrows, numoutleigs, outlthresh, badlist, outinfo) ;
    if (nbad == 0) break ; 
    // report each outlier, flag it ignore, then rebuild the sample index
    for (i=0; i<nbad; i++) 
    {  
      j = badlist[i] ;
      indx = xindlist[j] ;
      outpt = outinfo[j] ;
      fprintf(outlfile, "REMOVED outlier %s iter %d evec %d sigmage %.3f\n", indx -> ID, outliter, outpt -> vecno, outpt -> score) ;
      indx -> ignore = YES ;
    }
    nrows = loadindx(xindlist, xindex, indivmarkers, numindivs) ;
    printf("number of samples after outlier removal: %d\n", nrows) ;
  }

  // outlfile is stdout unless outliername was given, so only close it then
  if (outliername != NULL) fclose(outlfile) ;

  // m == 0 is treated as "no data"
  // (presumably numgtz counts the strictly positive entries of lambda - confirm)
  m = numgtz(lambda, nrows)  ;
  /* printf("matrix rank: %d\n", m) ; */
  if (m==0) fatalx("no data\n") ;

/** smartrel code */
  // for each of the leading numeigs eigenvectors, scale it by sqrt(lambda[i])
  // into ww and hand it to subouter() together with XTX
  // (presumably subouter subtracts the outer product ww*ww' from XTX - confirm)
  for (i=0; i<numeigs; i++) {  
   y = sqrt(lambda[i]) ;
   vst(ww, evecs+i*nrows, y, nrows) ;               
   subouter(XTX, ww, nrows) ;
  }
  free(tvecs) ; 

  // collect every pair i<j whose score reaches relthresh.
  // vecind[n] keeps the flat index i*nrows+j of the pair; evecs is reused as
  // scratch storage for the negated score so that sorting can be done on it
  n = 0 ;
  ZALLOC(vecind, nrows*nrows/2, int) ; 
  for (i=0; i<nrows; i++) { 
   for (j=i+1; j<nrows; j++) { 
    k = i*nrows + j ; 
    y1 = XTX[i*nrows+i] ;
    y2 = XTX[j*nrows+j] ;
    // off-diagonal entry normalised by the two diagonal entries, shifted by 1/(nrows-1)
    y = XTX[k]/sqrt(y1*y2) ;
    y += 1/(double)(nrows-1);
    if (y<relthresh) continue ;
    vecind[n] = k ; 
    evecs[n] = -y ;
    ++n ;
   }
  }
  free(XTX) ; 
  if (n==0) { 
   printf("## nothing above relthresh!\n") ;
   printf("##end of smartrel run\n") ;
   return 0 ;
  }
  // NOTE(review): the printing loop reads the score from evecs[i] and the pair
  // from vecind[idperm[i]], which is consistent only if sortit() sorts evecs in
  // place and stores the original positions in idperm - confirm against sortit
  ZALLOC(idperm, n, int) ; 
  sortit(evecs, idperm, n) ;
  for (i=0; i<n; i++) {  
   j = idperm[i] ;
   k = vecind[j] ;
   // decode the flat index back into the two sample positions
   a = k/nrows ; 
   b = k%nrows ;
   printf("rel: %20s ",  xindlist[a] ->ID) ;
   printf("%20s ",  xindlist[b] ->ID) ;
   printf(" %9.3f", -evecs[i]) ;
   printnl() ;
  }
  
  printf("##end of smartrel run\n") ;
  return 0 ;
}
Пример #3
0
Файл: pca.c Проект: b1234561/EIG
/*
 * pca: principal components of a genotype matrix with iterative outlier
 * removal.
 *
 * Required flags (each exactly once):
 *   -i  input genotype file
 *   -o  output file: K, the top K eigenvalues, then one line per sample
 *       with that sample's coordinate on each of the top K eigenvectors
 *   -e  output file receiving all NSAMPLES eigenvalues
 *   -l  log file
 * Optional flags:
 *   -k  number of principal components to output        (default 10)
 *   -m  maximum number of outlier removal iterations    (default 5)
 *   -t  number of components scanned for outliers       (default 10)
 *   -s  number of standard deviations that makes a sample an outlier
 *       (default 6.0)
 *
 * Input format, as parsed below: one line per SNP, one character per sample;
 * '0', '1', '2' are read as 0.0, 0.5, 1.0 and '9' marks a missing value.
 * The number of samples is the length of the first line.
 *
 * Each pass re-reads the input, builds XTX (samples x samples) from the
 * normalised SNP rows, calls eigvecs() on it and then flags as outliers the
 * samples lying more than SIGMATHRESH standard deviations from the mean on
 * any of the scanned eigenvectors. Passes stop when no new outlier is found
 * or after MAXITER removal rounds.
 *
 * Returns 0 on success; every error path reports and calls exit(1).
 */
int main(int argc, char **argv)
{
  int k, n, m, nn, rowvalid, *outlier, i;
  int iter, nonewoutliers, nflags;
  int topk;         /* TOPK clamped to the number of eigenvectors available */
  int rowcomplete;  /* 0 when end of input was hit while reading a SNP row */
  char Xchar;
  double *X, *XTX, rowsum, rowmean = 0, rowmeanbayes = 0, *sigmaoutlier;
  double *eval, *evec, sum, summ, sum1, mean, sdev, sigma;
  FILE *fp, *fpout, *fplog, *fpeval;
  char *INFILE = NULL;
  char *OUTFILE = NULL;
  char *EVALFILE = NULL;
  char *LOGFILE = NULL;

  /* set default values */
  K=10; MAXITER=5; TOPK=10; SIGMATHRESH=6.0;

  /* process flags */
  nflags = 0;
  while((i = getopt(argc,argv,"i:k:o:e:l:m:t:s:")) != -1)
  {
    switch(i)
    {
      case 'i':          /* input file */
        INFILE = (char *) strdup(optarg);
        nflags++; break;
      case 'k':
        K = atoi(optarg); /* number of principal components to output */
        break;
      case 'o':          /* output file */
        OUTFILE = (char *) strdup(optarg);
        nflags++; break;
      case 'e':          /* output eval file */
        EVALFILE = (char *) strdup(optarg);
        nflags++; break;
      case 'l':          /* log file */
        LOGFILE = (char *) strdup(optarg);
        nflags++; break;
      case 'm':
        MAXITER = atoi(optarg); /* max # of outlier removal iterations */
        break;
      case 't':
        TOPK = atoi(optarg); /* # of PCs along which to remove outliers */
        break;
      case 's':
        SIGMATHRESH = atof(optarg); /* # sdev to declare as outlier */
        break;
    }
  }
  /* the count alone is not enough: a repeated flag plus a missing one also
     adds up to 4 and would leave a NULL file name behind */
  if(nflags != 4 || INFILE == NULL || OUTFILE == NULL ||
     EVALFILE == NULL || LOGFILE == NULL)
  {
    fprintf(stderr,"Usage: -i -o -e -l flags must all be specified\n");
    exit(1);
  }

  /* open output files */
  if( (fpout = fopen(OUTFILE, "w")) == NULL)
  {
    fprintf(stderr,"Could not open output file %s\n", OUTFILE);  exit(1);
  }
  if( (fpeval = fopen(EVALFILE, "w")) == NULL)
  {
    fprintf(stderr,"Could not open output file %s\n", EVALFILE);  exit(1);
  }
  if( (fplog = fopen(LOGFILE, "w")) == NULL)
  {
    fprintf(stderr,"Could not open log file %s\n", LOGFILE);  exit(1);
  }

  /* print parameters */
  fprintf(fplog,"pca program run using parameters\n");
  fprintf(fplog," -i %s\n",INFILE);
  fprintf(fplog," -k %d\n",K);
  fprintf(fplog," -o %s\n",OUTFILE);
  fprintf(fplog," -e %s\n",EVALFILE);
  fprintf(fplog," -l %s\n",LOGFILE);
  fprintf(fplog," -m %d\n",MAXITER);
  fprintf(fplog," -t %d\n",TOPK);
  fprintf(fplog," -s %.03f\n",SIGMATHRESH);
  fprintf(fplog,"\n");

  /* Determine NSAMPLES: number of characters on the first line */
  if( (fp = fopen(INFILE, "r")) == NULL)
  {
    fprintf(stderr,"Could not open input file %s\n", INFILE);  exit(1);
  }
  n = 0;
  while(1)
  {
    /* end of input also ends the first line; without this check an empty
       file, or a single line with no newline, would never leave the loop */
    if(fscanf(fp,"%c",&Xchar) != 1) break;
    if(Xchar == '\n') break;
    n++;
  }
  NSAMPLES = n;
  fclose(fp);
  if(K > NSAMPLES-1)
  {
    fprintf(stderr,"OOPS k=%d is too large for only %d samples\n",K,NSAMPLES);
    fprintf(fplog,"OOPS k=%d is too large for only %d samples\n",K,NSAMPLES);
    exit(1);
  }

  /* evec holds NSAMPLES eigenvectors, so never scan more than that */
  topk = TOPK;
  if(topk > NSAMPLES) topk = NSAMPLES;

  /* malloc */
  if((eval = (double *) malloc(NSAMPLES*sizeof(*eval))) == NULL)
    { fprintf(stderr,"CM\n");  exit(1); }
  if((evec = (double *) malloc(NSAMPLES*NSAMPLES*sizeof(*evec))) == NULL)
    { fprintf(stderr,"CM\n");  exit(1); }
  if((outlier = (int *) malloc(NSAMPLES*sizeof(*outlier))) == NULL)
    { fprintf(stderr,"CM\n");  exit(1); }
  if((sigmaoutlier = (double *) malloc(NSAMPLES*sizeof(*sigmaoutlier))) == NULL)
    { fprintf(stderr,"CM\n");  exit(1); }
  if((X = (double *) malloc(NSAMPLES*sizeof(*X))) == NULL)
    { fprintf(stderr,"CM\n");  exit(1); }
  if((XTX = (double *) malloc(NSAMPLES*NSAMPLES*sizeof(*XTX))) == NULL)
    { fprintf(stderr,"CM\n");  exit(1); }

  nonewoutliers = 0;
  for(n=0; n<NSAMPLES; n++) outlier[n] = 0;
  iter = 0;
  while(nonewoutliers == 0)
  {
    for(n=0; n<NSAMPLES; n++) sigmaoutlier[n] = 0.0;
    /* initialize XTX */
    for(n=0; n<NSAMPLES; n++)
    {
      for(nn=0; nn<NSAMPLES; nn++)
        XTX[NSAMPLES*n+nn] = 0.0;
    }

    nonewoutliers = 1;
    /* get data */
    if( (fp = fopen(INFILE, "r")) == NULL)
    {
      fprintf(stderr,"Could not open input file %s\n", INFILE);  exit(1);
    }
    m = 0;
    while(1) /* do EVERYTHING for SNP m */
    {
      rowcomplete = 1;
      for(n=0; n<NSAMPLES; n++)
      {
        /* end of input inside a row: the partial row is discarded */
        if(fscanf(fp,"%c",&Xchar) != 1) { rowcomplete = 0; break; }
        if(Xchar == '0') { X[n] = 0.0; }
        else if(Xchar == '1') { X[n] = 0.5; }
        else if(Xchar == '2') { X[n] = 1.0; }
        else if(Xchar == '9') { X[n] = -100.0; }
        else
        {
          fprintf(stderr,"OOPS bad char %c at m=%d n=%d\n",Xchar,m,n);
          fprintf(fplog,"OOPS bad char %c at m=%d n=%d\n",Xchar,m,n);
          exit(1);
        }
        /* samples already removed as outliers are treated as missing */
        if(outlier[n] == 1) X[n] = -100.0;
      }
      if(!rowcomplete) break;
      /* should be \n character; a failure here just means the last line has
         no trailing newline, and the next row read will end the loop */
      if(fscanf(fp,"%c",&Xchar) != 1) Xchar = '\n';

      /* mean-adjust this SNP (values below -99 mark missing entries) */
      rowvalid = 0;
      rowsum = 0.0;
      for(n=0; n<NSAMPLES; n++)
      {
        if(X[n] >= -99.0)
        {
          rowvalid++;
          rowsum += X[n];
        }
      }
      if(rowvalid > 0)
      {
        rowmean = (rowsum)/((double)(rowvalid));
        /* smoothed frequency, strictly between 0 and 1, used only for scaling */
        rowmeanbayes = (rowsum+0.5)/((double)(1+rowvalid));
      }
      for(n=0; n<NSAMPLES; n++)
      {
        if(X[n] >= -99.0)
        {
          X[n] -= rowmean;
          X[n] /= sqrt(rowmeanbayes*(1.0-rowmeanbayes));
        }
        else
          X[n] = 0.0;  /* missing entries contribute nothing to XTX */
      }

      /* update XTX (upper triangle only; mirrored further down) */
      for(n=0; n<NSAMPLES; n++)
      {
        for(nn=n; nn<NSAMPLES; nn++)
          XTX[NSAMPLES*n+nn] += X[n]*X[nn];
      }
      m++;
    }
    /* close here so the handle is released on every way out of the pass,
       including the MAXITER break below */
    fclose(fp);

    nSNP = m;
    if(K > nSNP-1)
    {
      fprintf(stderr,"OOPS k=%d is too large for only %d SNPs\n",K,nSNP);
      fprintf(fplog,"OOPS k=%d is too large for only %d SNPs\n",K,nSNP);
      exit(1);
    }
    if(iter == 0)
    {
      fprintf(fplog,"nSNP=%d NSAMPLES=%d\n",nSNP,NSAMPLES);
    }

    /* complete XTX: average over SNPs, then mirror into the lower triangle */
    for(n=0; n<NSAMPLES; n++)
    {
      for(nn=n; nn<NSAMPLES; nn++)
        XTX[NSAMPLES*n+nn] /= ((double)nSNP);
    }
    for(n=0; n<NSAMPLES; n++)
    {
      for(nn=0; nn<n; nn++)
        XTX[NSAMPLES*n+nn] = XTX[NSAMPLES*nn+n];
    }

    /* do eigenanalysis */
    eigvecs(XTX, eval, evec, NSAMPLES); /* eigenvector k is evec[k*NSAMPLES+n] */

    if(iter == MAXITER) break; /* no need to look for outliers */

    /* find outliers: per scanned eigenvector, compute mean and standard
       deviation over the samples still in play and flag those further than
       SIGMATHRESH standard deviations from the mean */
    for(k=0; k<topk; k++)
    {
      sum=0.0; summ=0.0; sum1=0.0;
      for(n=0; n<NSAMPLES; n++)
      {
        if(outlier[n] == 1) continue;
        sum += evec[k*NSAMPLES+n];
        summ += evec[k*NSAMPLES+n]*evec[k*NSAMPLES+n];
        sum1 += 1.0;
      }
      mean = sum/sum1;
      sdev = sqrt(summ/sum1 - mean*mean);
      for(n=0; n<NSAMPLES; n++)
      {
        if(outlier[n] == 1) continue;
        sigma = (evec[k*NSAMPLES+n]-mean)/sdev;
        if(sigma < 0) sigma = -sigma;
        if(sigma > SIGMATHRESH)
        {
          /* remember the largest deviation seen for this sample */
          if(sigma > sigmaoutlier[n]) sigmaoutlier[n] = sigma;
          nonewoutliers = 0;
        }
      }
    }
    fprintf(fplog,"Outlier removal iteration %d:\n",iter);
    if(nonewoutliers) fprintf(fplog,"  no outliers detected\n");
    for(n=0; n<NSAMPLES; n++)
    {
      if(sigmaoutlier[n] > 0.0)
      {
        fprintf(fplog,"  removed outlier individual %d (%.02f sigma)\n",n,sigmaoutlier[n]);
        outlier[n] = 1;
      }
    }
    iter++;
  }

  /* print eval and evec */
  for(k=0; k<NSAMPLES; k++) fprintf(fpeval,"%.06f\n",eval[k]);
  fprintf(fpout,"%d\n",K);
  for(k=0; k<K; k++) fprintf(fpout,"%.04f\n",eval[k]);
  for(n=0; n<NSAMPLES; n++)
  {
    for(k=0; k<K; k++)
    {
      fprintf(fpout," ");
      /* extra blank keeps positive and negative values column-aligned */
      if(evec[k*NSAMPLES+n] > 0) fprintf(fpout," ");
      fprintf(fpout,"%.04f",evec[k*NSAMPLES+n]);
    }
    fprintf(fpout,"\n");
  }

  fclose(fpeval);
  fclose(fpout);
  fclose(fplog);
  return 0;
}