Exemplo n.º 1
0
/**
 * Split a multi-record FASTA file into one file per record.
 *
 * A record starts at a line whose first character is '>' and runs up to (not
 * including) the next such line. Empty lines and lines for which
 * IsBlankLine() is true are dropped. Each record is written to
 * "<outpath>/<id>.<ext>", where id is the first word of the header line
 * (words are delimited by space, tab, CR, LF or '>'). When the header has no
 * such word the id falls back to "<rootname>_<index>", with rootname taken
 * from rootname(seqDataFile) and index being the 1-based record number.
 * Progress and a final summary are printed to stdout.
 *
 * @param seqDataFile path of the input FASTA file
 * @param outpath     directory receiving the per-record files
 * @param ext         extension (without the dot) of the per-record files
 */
void SplitFasta(const char* seqDataFile, const char *outpath, const char *ext)/*{{{*/
{
	FILE *fpSeqData = fopen(seqDataFile,"r");
	// NOTE(review): checkfilestream is presumed to handle a NULL stream
	// (report and abort) -- confirm against its definition.
	checkfilestream(fpSeqData, seqDataFile, "r");

	char seqFile[MAX_PATH+1] = "";
	char id[SIZE_ID+1] = "";
	char str[SIZE_ID+10+1] = "";
	char rtname[MAX_PATH+1] = "";
	rootname(seqDataFile,rtname);

	Array1D <char> line_1darray(maxline+1);
	char *line = line_1darray.array1D;
	const char delim[] = " \t\r\n>";

	FILE *fpSeq = NULL;
	fpos_t pos;
	int linesize = 0;
	int cnt = 0;        // number of record headers seen so far
	int nExported = 0;  // number of records actually written
	while((linesize = fgetline(fpSeqData,line,maxline)) != EOF)
	{
		if(linesize == 0 || IsBlankLine(line)) continue;
		if(line[0] != '>') continue; // not the beginning of a record

		cnt ++ ;

		// Reset the id for every record; otherwise a header without a word
		// would silently reuse the id of the previous record and overwrite
		// that record's file.
		id[0] = '\0';
		my_strcpy(str, line, SIZE_ID+10);
		char *pch = strtok(str,delim);
		if(pch != NULL) //take the first word following '>' as id
		{
			my_strcpy(id, pch, SIZE_ID);
		}
		if(id[0] == '\0') // no word following '>': use rootname(seq-file)_cnt
		{
			// Build the numeric suffix first and truncate the root name
			// rather than the suffix, so fallback ids stay unique and
			// never overflow id.
			char suffix[32] = "";
			snprintf(suffix, sizeof(suffix), "_%d", cnt);
			int keep = SIZE_ID - static_cast<int>(strlen(suffix));
			if(keep < 0) keep = 0;
			snprintf(id, sizeof(id), "%.*s%s", keep, rtname, suffix);
		}

		int pathLen = snprintf(seqFile, sizeof(seqFile), "%s/%s.%s", outpath, id, ext);
		if(pathLen < 0 || static_cast<size_t>(pathLen) >= sizeof(seqFile))
		{
			// A truncated path could clobber an unrelated file; skip the
			// record instead. Its sequence lines are ignored by the outer
			// loop because they do not start with '>'.
			fprintf(stderr,"Output path for record %d (%s) is too long, record skipped\n",cnt,id);
			continue;
		}

		printf("%d\t:%s exported\n",cnt,id);
		fpSeq = fopen(seqFile,"w");
		checkfilestream(fpSeq, seqFile, "w");
		fprintf(fpSeq,"%s\n",line);

		// Copy the body of the record. The stream position is saved before
		// every read so that the header of the next record can be pushed
		// back for the outer loop.
		while(!feof(fpSeqData))
		{
			fgetpos(fpSeqData,&pos);
			linesize = fgetline(fpSeqData,line,maxline);
			if(linesize == EOF) break;
			if(linesize == 0 || IsBlankLine(line)) continue;
			if(line[0] == '>')
			{
				fsetpos(fpSeqData,&pos);
				break;
			}
			fprintf(fpSeq,"%s\n",line);
		}
		fclose(fpSeq);
		nExported ++ ;
	}
	fclose(fpSeqData);
	printf("%d sequences exported!\n",nExported);
}
Exemplo n.º 2
0
/** Read examples into memory.
 *
 *  Parses the text file named by featureFile. Blank lines and lines starting
 *  with '#' are ignored everywhere. The expected layout, as read here, is:
 *
 *    - a header with the three tags "maxDuration:<int>", "minDuration:<int>"
 *      and "globalFeatureDim:<int>" (any order);
 *    - a "sequence:<n>" tag;
 *    - a "phi:1" tag followed by lines "pos:<p> <element> <element> ...";
 *    - a "phi:2" tag followed by lines "pos:<p1>,<p2> <element> ...".
 *
 *  Each <element> is parsed with svec_feature_index_and_value_format into a
 *  feature index and a value, and stored in a 1 x featureDimension sparse
 *  row. phi:1 rows are appended to phi_1; phi:2 rows are collected in groups
 *  that are appended to phi_2. The function also sets maxDuration,
 *  minDuration, featureDimension, nnz, numOfSeq and density.
 *
 *  @throws CBMRMException if the file cannot be opened or any of the
 *          sections above is missing or malformed.
 */
void CSeqFeature::LoadFeatures()
{ 
        unsigned int tmpFidx = 0;
        Scalar tmpFval = 0;
        unsigned int featureCnt = 0;
        unsigned int seqNum = 0;
        unsigned int phiNum = 0;
        unsigned int posNum1 = 0, posNum2 = 0;
        std::string line = "";
        std::string token = "";
        std::ifstream featureFp;
   
        featureFp.open(featureFile.c_str());   
        if(!featureFp.good()) 
        {
                string msg = "Cannot open feature file <" + featureFile + ">!";
                throw CBMRMException(msg, "CSeqFeature::LoadFeatures()");
        }
   
        // read header information
        int headerInfoCnt = 3; // min duration, max duration, feature dimension
        do {
                getline(featureFp, line);
                trim(line);
                if(IsBlankLine(line)) continue;  // blank line
                if(line[0] == '#') continue;  // comment line
                if(sscanf(line.c_str(),"maxDuration:%d",&maxDuration)==1) headerInfoCnt--;
                if(sscanf(line.c_str(),"minDuration:%d",&minDuration)==1) headerInfoCnt--;
                if(sscanf(line.c_str(),"globalFeatureDim:%d",&featureDimension)==1) headerInfoCnt--;
        } while(!featureFp.eof() && (headerInfoCnt != 0));
        
        // Reject a truncated file before validating the header values, so an
        // incomplete header is reported as an exception rather than tripping
        // the assertions on values that were never read.
        if(featureFp.eof())
                throw CBMRMException("Feature file does not contain valid examples","CSeqFeature::LoadFeatures()");
        
        assert(maxDuration >= minDuration);
        assert(featureDimension < (1<<30));  // featureDimension is normally less than 1 billion
        
        // read sequences
        nnz = 0;
        while(!featureFp.eof()) 
        {
                // read sequence number
                do {
                        getline(featureFp, line);
                        trim(line);
                        if(IsBlankLine(line)) continue;  // blank line
                        if(line[0] == '#') continue;  // comment line
                        if(sscanf(line.c_str(),"sequence:%u",&seqNum)==1) break;
                } while(!featureFp.eof());
                
                if(featureFp.eof())
                        throw CBMRMException("Feature file does not contain valid phi:*","CSeqFeature::LoadFeatures()");
                
                
                // read phi:1 tag
                phiNum = 0;
                do {
                        getline(featureFp, line);
                        trim(line);
                        if(IsBlankLine(line)) continue;  // blank line
                        if(line[0] == '#') continue;  // comment line
                        if(sscanf(line.c_str(),"phi:%u",&phiNum)==1) break;
                } while(!featureFp.eof());
                
                if(featureFp.eof() || (phiNum != 1))
                        throw CBMRMException("Feature file does not contain valid phi:1","CSeqFeature::LoadFeatures()");
                
                // read phi:1 sparse vectors
                do {
                        getline(featureFp, line);
                        trim(line);
                        if(IsBlankLine(line)) continue;  // blank line
                        if(line[0] == '#') continue;  // comment line
                        
                        // the next "phi:" tag ends the phi:1 section
                        if(sscanf(line.c_str(),"phi:%u",&phiNum) == 1)
                                break;
                        
                        istringstream iss(line);
                        iss >> token;
                        if((sscanf(token.c_str(),"pos:%u",&posNum1) != 1))
                                throw CBMRMException("Feature file does not contain valid pos tag in phi:1","CSeqFeature::LoadFeatures()");
                        
                        TheMatrix svec(1,featureDimension,SML::SPARSE);
                        featureCnt = 0;
                        // Extract-then-test: a failed extraction ends the loop
                        // instead of re-processing the previous token.
                        while(iss >> token)
                        {
                                if(sscanf(token.c_str(),svec_feature_index_and_value_format.c_str(),&tmpFidx, &tmpFval) != 2)
                                {
                                        ostringstream msg;
                                        msg << "Invalid #" << featureCnt + 1 << " sparse vector element in phi:"<< phiNum << " seq:" << seqNum << " pos:" << posNum1;
                                        throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()");
                                }
                                svec.Set(0,tmpFidx,tmpFval);
                                featureCnt++;  // count parsed elements so empty vectors can be detected
                                nnz++;
                        }
                        
                        if(featureCnt == 0)
                                throw CBMRMException("Feature file does not contain valid phi:1 sparse vector","CSeqFeature::LoadFeatures()");
                        
                        phi_1.push_back(svec);
                } while(!featureFp.eof());
                
                if(phi_1.size() < 1)
                        throw CBMRMException("Feature file does not contain valid phi:1","CSeqFeature::LoadFeatures()");
                
                numOfSeq = phi_1.size();
                
                if(featureFp.eof() || (phiNum != 2))
                        throw CBMRMException("Feature file does not contain valid phi:2","CSeqFeature::LoadFeatures()");
                
                // read phi:2 sparse vectors
                // NOTE(review): prevPosNum1/prevPosNum2 are never updated in
                // this loop, so they stay 0: the two ordering checks below
                // only reject a position of 0, and the group test
                // (posNum1 != prevPosNum1) is true for every accepted line.
                // The intended ordering/grouping rule cannot be determined
                // from this function -- confirm before relying on the layout
                // of phi_2.
                unsigned int prevPosNum1 = 0, prevPosNum2 = 0; 
                vector<TheMatrix> tmp_phi_2_svecs;
                featureCnt = 0;
                do {
                        getline(featureFp, line);
                        trim(line);
                        if(IsBlankLine(line)) continue;  // blank line
                        if(line[0] == '#') continue;  // comment line
                        
                        // any further "phi:" tag ends the phi:2 section
                        if((sscanf(line.c_str(),"phi:%u",&phiNum) == 1))
                                break;
                        
                        istringstream iss(line);
                        iss >> token;
                        if((sscanf(token.c_str(),"pos:%u,%u",&posNum1,&posNum2) != 2))
                                throw CBMRMException("Feature file does not containt valid pos tag in phi:2","CSeqFeature::LoadFeatures()");
                        
                        if(prevPosNum2 >= posNum2)
                        {
                                ostringstream msg;
                                msg << "previous posNum2 must be > current posNum2 in phi:2 (phi:2 pos:" << posNum1 << "," << posNum2;
                                throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()");
                        }
                        
                        if(prevPosNum1 >= posNum1)
                        {
                                ostringstream msg;
                                msg << "previous posNum1 must be > current posNum1 in phi:2 (phi:2 pos:" << posNum1 << "," << posNum2;
                                throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()");
                        }
                        
                        // close the group collected so far and start a new one
                        if(posNum1 != prevPosNum1)
                        {
                                phi_2.push_back(tmp_phi_2_svecs);
                                tmp_phi_2_svecs.clear();                                
                        }
                        
                        TheMatrix svec(1,featureDimension,SML::SPARSE);
                        featureCnt = 0;
                        while(iss >> token)
                        {
                                if(sscanf(token.c_str(),svec_feature_index_and_value_format.c_str(),&tmpFidx, &tmpFval) != 2)
                                {
                                        ostringstream msg;
                                        msg << "Invalid #" << featureCnt + 1 << " sparse vector element in phi:"<< phiNum << " seq:" << seqNum << " pos:" << posNum1;
                                        throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()");
                                }
                                svec.Set(0,tmpFidx,tmpFval);
                                featureCnt++;  // count parsed elements so empty vectors can be detected
                                nnz++;
                        }
                        
                        if(featureCnt == 0)
                                throw CBMRMException("Feature file does not containt valid phi:2 sparse vector","CSeqFeature::LoadFeatures()");
                        
                        tmp_phi_2_svecs.push_back(svec);

                } while(!featureFp.eof());
                
                // The loop only stores a group when the following line
                // arrives, so the final group is still pending here; store it
                // so the last parsed vectors are not lost.
                if(!tmp_phi_2_svecs.empty())
                        phi_2.push_back(tmp_phi_2_svecs);
                
                if(phi_2.size() < 1)
                        throw CBMRMException("Feature file does not contain phi:2","CSeqFeature::LoadFeatures()");
        }
        
        // data matrix density
        density = ((double)nnz/featureDimension)/numOfSeq;
   
        featureFp.close();
}