/**
 * Reads a plain-text file line by line and stores it as one new document.
 *
 * A documentStructure is created whose id is the number of documents already
 * held in `documents`; every line of the file becomes one segmentStructure of
 * that document, and the document is finally handed to addDocument().
 *
 * @param fileName    path of the file to read.
 * @param caseOn      when false, each line is passed through lowerCase().
 * @param noPunct     when true, each line is passed through removePunctTercom()
 *                    (tercomLike == false) or removePunct() (tercomLike == true).
 * @param debugMode   only echoed in the debug trace; tracing itself is driven by
 *                    multiTxtDocumentParams.debugMode.
 * @param noTxtIds    when true, segment ids are a 1-based line counter and the
 *                    whole line is the segment text; when false, the id is the
 *                    part of the line starting at the last '(' and the text is
 *                    what precedes it.
 * @param tercomLike  selects the punctuation-removal function (see noPunct); the
 *                    tokenization step that used to depend on it is commented out.
 *
 * The process is terminated through exit() when the file cannot be opened or,
 * with noTxtIds == false, when a line contains no '('.
 */
void multiTxtDocument::loadFile ( string fileName, bool caseOn,  bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike )
    {
        if ( multiTxtDocumentParams.debugMode )
        {
            cerr << "DEBUG tercpp : multiTxtDocument::loadFile : loading files  " << endl << fileName << endl << "END DEBUG" << endl;
            cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing params  " << endl << Tools::printParams ( multiTxtDocumentParams ) << endl << "END DEBUG" << endl;
            cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing others params  " << endl << "caseOn : " << caseOn << endl << "noPunct : " << noPunct << endl << "debugMode : " << debugMode << endl << "noTxtIds : " << noTxtIds << endl << "tercomLike : " << tercomLike << endl << "END DEBUG" << endl;
        }

        ifstream fichierLoad ( fileName.c_str(), ios::in );

        // The new document is identified by the current number of stored documents.
        documentStructure l_doc;
        stringstream l_stream;
        l_doc.setFileName ( fileName );
        l_stream << ( int ) documents.size();
        l_doc.setDocId ( l_stream.str() );

        if ( !fichierLoad )
        {
            cerr << "ERROR : multiTxtDocument::loadFile : can't open file : " + fileName + " !" << endl;
            // NOTE(review): exit status 0 on an error path looks unintended; kept
            // unchanged because external scripts may rely on it.
            exit ( 0 );
        }

        int l_ids = 1;
        string line;
        while ( getline ( fichierLoad, line ) )
        {
            string l_key;
            string line_mod;

            if ( noTxtIds )
            {
                // No id in the text: number the segments 1, 2, 3, ...
                l_stream.str ( "" );
                l_stream << l_ids;
                l_key = l_stream.str();
                line_mod = line;
                l_ids++;
            }
            else
            {
                // Search once and compare against npos instead of casting to int.
                const string::size_type l_idPos = line.rfind ( '(' );
                if ( l_idPos == string::npos )
                {
                    cerr << "ERROR : multiTxtDocument::loadFile : Id not found, maybe you should use the --noTxtIds Option ? " << endl;
                    // NOTE(review): same exit status remark as above.
                    exit ( 0 );
                }
                // The key runs from the last '(' to the end of the line.
                l_key = line.substr ( l_idPos );

                // The text is everything before the '(' minus one separating blank.
                // Guarding the decrement avoids the size_t wrap-around that made the
                // whole line (id included) the text when the line starts with '(',
                // and avoids chopping a real character when no blank precedes '('.
                string::size_type l_textLen = l_idPos;
                if ( l_textLen > 0 && ( line[l_textLen - 1] == ' ' || line[l_textLen - 1] == '\t' ) )
                {
                    --l_textLen;
                }
                line_mod = line.substr ( 0, l_textLen );
            }
            if ( multiTxtDocumentParams.debugMode )
            {
                cerr << "DEBUG multiTxtDocument::loadFile : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
            }
            if ( !tercomLike )
            {
                if ( multiTxtDocumentParams.debugMode )
                {
                    cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl;
                }

//                    line_mod = tokenizePunct ( line_mod );
            }
            if ( !caseOn )
            {
                if ( multiTxtDocumentParams.debugMode )
                {
                    cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl;
                }
                line_mod = lowerCase ( line_mod );
            }
            if ( noPunct )
            {
                if ( multiTxtDocumentParams.debugMode )
                {
                    cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl;
                }
                // NOTE(review): removePunctTercom is used when tercomLike is FALSE;
                // the naming suggests the branches might be swapped - confirm.
                if ( !tercomLike )
                {
                    line_mod = removePunctTercom ( line_mod );
                }
                else
                {
                    line_mod = removePunct ( line_mod );
                }
            }
            if ( multiTxtDocumentParams.debugMode )
            {
                cerr << "DEBUG multiTxtDocument::loadFile : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
            }

            // Split the normalized text with " " as separator and store it as a segment.
            vector<string> vecDocLine = stringToVector ( line_mod, " " );
            if ( multiTxtDocumentParams.debugMode )
            {
                cerr << "DEBUG tercpp multiTxtDocument::loadFile : " << l_key << "|" << vectorToString ( vecDocLine ) << "|" << endl << "Vector Size : " << vecDocLine.size() << endl << "Line length : " << ( int ) line_mod.length() << endl << "END DEBUG" << endl;
            }
            segmentStructure l_seg ( l_key, vecDocLine, l_doc.getDocId() );
            l_doc.addSegments ( l_seg );
        }

        fichierLoad.close();  // close the file before registering the document
        addDocument ( l_doc );
        if ( multiTxtDocumentParams.debugMode )
        {
            cerr << "DEBUG multiTxtDocument::loadFile : document " << l_doc.getDocId() << " added !!!" << endl << "END DEBUG" << endl;
        }
    }
// Example no. 2
// 0
    /**
     * Recursively walks the TinyXML tree rooted at pParent and copies what it
     * finds into sgmlDoc.
     *
     * - ELEMENT nodes: the element name is passed through Tools::lowerCase();
     *   "refset"/"tstset" set the document type, "doc" appends an empty
     *   documentStructure, "seg" appends an empty segmentStructure to the last
     *   document. For these recognised names the attributes are forwarded to
     *   dump_attribs_to_SGMLDocuments(); any other element is only recursed into.
     * - TEXT nodes reached with indent >= 2: the text is optionally run through
     *   tokenizePunct / lowerCase / removePunct[Tercom] according to xmlParams
     *   and added to the last segment of the last document.
     * - Every other node type except DOCUMENT prints a trace line on cerr.
     *
     * @param sgmlDoc  destination document collection.
     * @param pParent  node to process; a null pointer makes the call a no-op.
     * @param indent   recursion depth, incremented by one for each child level.
     */
    void xmlStructure::copy_to_SGMLDocument ( SGMLDocument* sgmlDoc, TiXmlNode* pParent, unsigned int indent )
    {
        if ( !pParent )
        {
            return;
        }

        const int l_nodeType = pParent->Type();
        if ( xmlParams.debugMode )
        {
            cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " TiXmlNode: " << l_nodeType << endl << "END DEBUG" << endl;
            cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " indent: " << indent << endl << "END DEBUG" << endl;
        }

        switch ( l_nodeType )
        {
        case TiXmlNode::DOCUMENT:
        {
            if ( xmlParams.debugMode )
            {
                cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " Document" << endl << "END DEBUG" << endl;
            }
            break;
        }

        case TiXmlNode::ELEMENT:
        {
            string l_tag = Tools::lowerCase ( pParent->Value() );
            if ( xmlParams.debugMode )
            {
                cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " elementValue: " << l_tag << endl << "END DEBUG" << endl;
            }

            // NOTE(review): the upper-case comparisons below are presumably
            // unreachable if Tools::lowerCase lower-cases its input - confirm.
            bool l_knownTag = true;
            if ( l_tag == "refset" || l_tag == "tstset" )
            {
                sgmlDoc->setDocType ( l_tag );
            }
            else if ( l_tag == "doc" || l_tag == "DOC" )
            {
                documentStructure l_newDoc;
                sgmlDoc->addDocument ( l_newDoc );
            }
            else if ( l_tag == "seg" || l_tag == "SEG" )
            {
                segmentStructure l_newSeg;
                sgmlDoc->getLastDocument()->addSegments ( l_newSeg );
            }
            else
            {
                l_knownTag = false;
            }

            if ( xmlParams.debugMode )
            {
                cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " Calling dump_attribs_to_SGMLDocuments with indent :" << indent + 1 << endl << "END DEBUG" << endl;
            }
            // Attributes are only copied for the element names handled above.
            if ( l_knownTag )
            {
                dump_attribs_to_SGMLDocuments ( sgmlDoc, pParent->ToElement(), indent + 1 );
            }
            break;
        }

        case TiXmlNode::TEXT:
        {
            TiXmlText* l_textNode = pParent->ToText();
            // Text found above depth 2 is ignored.
            if ( indent < 2 )
            {
                break;
            }

            documentStructure* l_lastDoc = sgmlDoc->getLastDocument();
            segmentStructure* l_lastSeg = l_lastDoc->getLastSegments();
            string l_content = l_textNode->Value();

            if ( !xmlParams.tercomLike )
            {
                if ( xmlParams.debugMode )
                {
                    cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line NOT tokenized |" << l_content << "|" << endl << "END DEBUG" << endl;
                    cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl;
                }
                l_content = tokenizePunct ( l_content );
            }
            if ( !xmlParams.caseOn )
            {
                if ( xmlParams.debugMode )
                {
                    cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl;
                }
                l_content = lowerCase ( l_content );
            }
            if ( xmlParams.noPunct )
            {
                if ( xmlParams.debugMode )
                {
                    cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl;
                }
                if ( xmlParams.tercomLike )
                {
                    l_content = removePunct ( l_content );
                }
                else
                {
                    l_content = removePunctTercom ( l_content );
                }
            }
            if ( xmlParams.debugMode )
            {
                cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line tokenized |" << l_content << "|" << endl << "END DEBUG" << endl;
            }
            l_lastSeg->addContent ( l_content );
            break;
        }

        default:
        {
            // NOTE(review): this trace is emitted even when xmlParams.debugMode
            // is off, unlike every other DEBUG message here - confirm intent.
            cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "Default TiXmlNode: " << l_nodeType << endl << "END DEBUG" << endl;
            break;
        }
        }

        // Depth-first descent into all children, one level deeper.
        for ( TiXmlNode* l_child = pParent->FirstChild(); l_child != 0; l_child = l_child->NextSibling() )
        {
            copy_to_SGMLDocument ( sgmlDoc, l_child, indent + 1 );
        }
    }