void multiTxtDocument::loadFile ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike ) { if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG tercpp : multiTxtDocument::loadFile : loading files " << endl << fileName << endl << "END DEBUG" << endl; cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing params " << endl << Tools::printParams ( multiTxtDocumentParams ) << endl << "END DEBUG" << endl; cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing others params " << endl << "caseOn : " << caseOn << endl << "noPunct : " << noPunct << endl << "debugMode : " << debugMode << endl << "noTxtIds : " << noTxtIds << endl << "tercomLike : " << tercomLike << endl << "END DEBUG" << endl; } ifstream fichierLoad ( fileName.c_str(), ios::in ); string line=""; documentStructure l_doc; stringstream l_stream; l_doc.setFileName(fileName); l_stream.str ( "" ); l_stream << ( int ) documents.size(); l_doc.setDocId ( l_stream.str() ); if ( fichierLoad ) { int l_ids = 1; l_stream.str ( "" ); string l_key=""; string line_mod=""; while ( getline ( fichierLoad, line ) ) { l_key=""; line_mod=""; l_stream.str ( "" ); if ( noTxtIds ) { l_stream << l_ids; l_key = l_stream.str(); line_mod = line; l_ids++; } else { if ((int)line.rfind ( "(" )==-1) { cerr << "ERROR : multiTxtDocument::loadFile : Id not found, maybe you should use the --noTxtIds Option ? " << endl; exit ( 0 ); } l_key = line.substr ( line.rfind ( "(" ), line.size() - 1 ); line_mod = line.substr ( 0, line.rfind ( "(" ) - 1 ); } if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG multiTxtDocument::loadFile : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl; } if ( !tercomLike ) { if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl; } // line_mod = tokenizePunct ( line_mod ); } if ( !caseOn ) { if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl; } line_mod = lowerCase ( line_mod ); } if ( noPunct ) { if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl; } if ( !tercomLike ) { line_mod = removePunctTercom ( line_mod ); } else { line_mod = removePunct ( line_mod ); } } if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG multiTxtDocument::loadFile : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl; } vector<string> vecDocLine = stringToVector ( line_mod, " " ); // string l_key; // hashHypothesis.addValue(l_key,vecDocLine); // l_key=(string)vecDocLine.at((int)vecDocLine.size()-1); // vecDocLine.pop_back(); if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG tercpp multiTxtDocument::loadFile : " << l_key << "|" << vectorToString ( vecDocLine ) << "|" << endl << "Vector Size : " << vecDocLine.size() << endl << "Line length : " << ( int ) line_mod.length() << endl << "END DEBUG" << endl; } // hashHypothesis.addValue(l_key,vecDocLine); segmentStructure l_seg ( l_key, vecDocLine, l_doc.getDocId() ); l_doc.addSegments ( l_seg ); } // Ref=line; // getline ( fichierHyp, line ); // Hyp=line; fichierLoad.close(); // on ferme le fichier addDocument ( l_doc ); if ( multiTxtDocumentParams.debugMode ) { cerr << "DEBUG multiTxtDocument::loadFile : document " << l_doc.getDocId() << " added !!!" << endl << "END DEBUG" << endl; } l_key.erase(); line_mod.erase(); l_stream.str(""); } else // sinon { cerr << "ERROR : multiTxtDocument::loadFile : can't open file : " + fileName + " !" << endl; exit ( 0 ); } }
void xmlStructure::copy_to_SGMLDocument ( SGMLDocument* sgmlDoc, TiXmlNode* pParent, unsigned int indent ) { if ( !pParent ) return; TiXmlNode* pChild; TiXmlText* pText; int t = pParent->Type(); // printf ( "%s", getIndent ( indent ) ); // int num; string elementValue; bool unknownTag=false; if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " TiXmlNode: " << t << endl << "END DEBUG" << endl; cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " indent: " << indent << endl << "END DEBUG" << endl; } switch ( t ) { case TiXmlNode::DOCUMENT: if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " Document" << endl << "END DEBUG" << endl; }// printf ( "Document" ); break; case TiXmlNode::ELEMENT: // printf ( "Element [%s]", pParent->Value() ); elementValue = Tools::lowerCase ( pParent->Value() ); if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " elementValue: " << elementValue << endl << "END DEBUG" << endl; } if ( ( ( int ) elementValue.compare ( "refset" ) == 0 ) || ( ( int ) elementValue.compare ( "tstset" ) == 0 ) ) { sgmlDoc->setDocType ( elementValue ); } else if (( ( int ) elementValue.compare ( "doc" ) == 0 ) || ( ( int ) elementValue.compare ( "DOC" ) == 0 )) { documentStructure tmp_doc; sgmlDoc->addDocument ( tmp_doc ); } else if ( ( ( int ) elementValue.compare ( "seg" ) == 0 ) || ( ( int ) elementValue.compare ( "SEG" ) == 0 ) ) { segmentStructure tmp_seg; ( sgmlDoc->getLastDocument() )->addSegments ( tmp_seg ); } else { unknownTag=true; } if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << " Calling dump_attribs_to_SGMLDocuments with indent :" << indent + 1 << endl << "END DEBUG" << endl; } if (!unknownTag) { dump_attribs_to_SGMLDocuments ( sgmlDoc, pParent->ToElement(), indent + 1 ); } // num = dump_attribs_to_stdout ( pParent->ToElement(), indent + 1 ); // switch ( num ) // { // case 0: // printf ( " (No attributes)" ); // break; // case 1: // printf ( "%s1 attribute", getIndentAlt ( indent ) ); // break; // default: // printf ( "%s%d attributes", getIndentAlt ( indent ), num ); // break; // } break; // case TiXmlNode::COMMENT: // printf ( "Comment: [%s]", pParent->Value() ); // break; // // case TiXmlNode::UNKNOWN: // printf ( "Unknown" ); // break; case TiXmlNode::TEXT: pText = pParent->ToText(); // printf ( "Text: [%s]", pText->Value() ); if ( indent >= 2 ) { documentStructure * l_tmp_doc = sgmlDoc->getLastDocument(); segmentStructure * l_tmp_seg = l_tmp_doc->getLastSegments(); string l_text = pText->Value(); string line_mod = l_text; if ( !xmlParams.tercomLike ) { if ( xmlParams.debugMode ) { cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl; } if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl; } line_mod = tokenizePunct ( line_mod ); } if ( !xmlParams.caseOn ) { if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl; } line_mod = lowerCase ( line_mod ); } if ( xmlParams.noPunct ) { if ( xmlParams.debugMode ) { cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl; } if ( !xmlParams.tercomLike ) { line_mod = removePunctTercom ( line_mod ); } else { line_mod = removePunct ( line_mod ); } } if ( xmlParams.debugMode ) { cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl; } l_tmp_seg->addContent ( line_mod ); } break; // case TiXmlNode::DECLARATION: // printf ( "Declaration" ); // break; default: cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "Default TiXmlNode: " << t << endl << "END DEBUG" << endl; break; } // printf ( "\n" ); for ( pChild = pParent->FirstChild(); pChild != 0; pChild = pChild->NextSibling() ) { copy_to_SGMLDocument ( sgmlDoc, pChild, indent + 1 ); } }