/**
 * @brief Parses one Wiki-markup document, writes its "semwiki" summary file
 *        and feeds the document's raw tokens to the token processor.
 *
 * Steps, in order:
 *  1. Derives the article title from the resource name (text after the last
 *     '/', up to the first ".txt") and the output path
 *     `<CUtilities::semwiki_dir_path>/<docID>_semwiki.txt`.
 *  2. Reads the resource line by line, joining the lines with a single space
 *     into one buffer, and collects the `<<Author>>` / `<<Timestamp>>`
 *     header values (the text following the first ':' on such a line).
 *  3. Runs parse() on the joined buffer.
 *  4. Writes the semwiki file and, while doing so, assigns ids to authors and
 *     categories not seen before and records them on @p doc.
 *  5. Tokenizes the joined buffer on whitespace, registers unseen raw tokens
 *     in CDocument::_raw_token_id_map, adds every token to @p doc and runs
 *     doc.process_token_list().
 *
 * If the input file cannot be opened a message is printed and processing
 * continues with an empty buffer.
 *
 * @param doc Document to parse; supplies the resource name and document id,
 *            and receives author ids, category ids and tokens.
 * @return The value returned by parse() for the file contents.
 */
bool CWikiMarkupParser::parse_document(CDocument& doc)
{
    std::cout << "Parsing Wiki markup document using a state based tag parser" << std::endl;

    const std::string filename = doc.get_resource_name();
    const int doc_id = doc.get_docID();

    // Output path: <semwiki dir>/<doc id>_semwiki.txt
    std::ostringstream id_stream;
    id_stream << doc_id;
    const std::string sem_wiki_fname =
        CUtilities::semwiki_dir_path + "/" + id_stream.str() + "_semwiki.txt";

    _semwiki_wiki_id = doc_id;

    // Article title = base name of the resource, cut at the first ".txt".
    // Positions stay in size_type so the npos comparison is exact.
    const std::string::size_type slash_pos = filename.find_last_of('/');
    const std::string base_name =
        (slash_pos == std::string::npos) ? filename : filename.substr(slash_pos + 1);
    _article_title = base_name.substr(0, base_name.find(".txt"));

    // ---- Read the file, gathering author / timestamp header lines ----------
    std::string file_contents;
    std::ifstream ifs(filename.c_str());
    if (ifs.is_open())
    {
        std::string line;
        // Loop on the read itself: testing eof() before reading never ends
        // when the stream fails without hitting EOF, and adds a phantom
        // empty line after the last real one.
        while (std::getline(ifs, line))
        {
            file_contents += line;
            file_contents += ' ';

            if (line.find("<<Author>>") != std::string::npos)
            {
                // Everything after the first ':' is a comma-separated list.
                std::string aut;
                std::istringstream liness(line);
                std::getline(liness, aut, ':');
                std::getline(liness, aut, '\n');

                std::string::size_type start = 0;
                for (;;)
                {
                    const std::string::size_type comma = aut.find(',', start);
                    if (comma == std::string::npos)
                    {
                        _authors.insert(aut.substr(start));
                        break;
                    }
                    _authors.insert(aut.substr(start, comma - start));
                    start = comma + 1;
                }
            }

            if (line.find("<<Timestamp>>") != std::string::npos)
            {
                std::string timestp;
                std::istringstream liness(line);
                std::getline(liness, timestp, ':');
                std::getline(liness, timestp, '\n');
                _timestamp = timestp;
            }
        }
    }
    else
    {
        std::cout << "Couldn't open file " << filename << std::endl;
    }
    ifs.close();

    // ---- Parse the joined contents -----------------------------------------
    std::cout << "file contents size = " << file_contents.size() << std::endl;
    const bool status = parse(file_contents);

    // ---- Write the semwiki summary file ------------------------------------
    std::ofstream ofs;
    ofs.open(sem_wiki_fname.c_str(), std::ios::trunc);
    if (!ofs.is_open())
    {
        // Keep going: the loops below also update the in-memory id maps and
        // the document, which must happen even when the file is unavailable.
        std::cout << "Couldn't open file " << sem_wiki_fname << std::endl;
    }

    ofs << "<<#WikiFileId>>\n";
    ofs << _semwiki_wiki_id << "\n";
    ofs << "<<#Article Title>>\n";
    ofs << _article_title << "\n";

    ofs << "<<#Author>>\n";
    for (std::set<std::string>::iterator ait = _authors.begin(); ait != _authors.end(); ++ait)
    {
        // First sighting of an author allocates the next author id.
        if (_author_map.find(*ait) == _author_map.end())
            _author_map[*ait] = ++_author_id;
        ofs << (*ait) << " $ ";
        doc._author_ids.push_back(_author_map[*ait]);
    }

    ofs << "\n<<#Timestamp>>\n";
    ofs << _timestamp << "\n";

    ofs << "<<#Infobox>>\n";
    for (std::set<std::string>::iterator i_itr = _infobox_details.begin();
         i_itr != _infobox_details.end(); ++i_itr)
    {
        ofs << *i_itr;
    }

    ofs << "\n<<#Sections>>\n";
    // Headers and details are walked in lock-step; stop at the shorter list.
    std::list<std::string>::iterator sec_itr = _section_header.begin();
    std::list<std::string>::iterator sec_det_itr = _section_details.begin();
    for (; sec_itr != _section_header.end() && sec_det_itr != _section_details.end();
         ++sec_itr, ++sec_det_itr)
    {
        ofs << *sec_itr << " $" << *sec_det_itr << "\n";
    }

    ofs << "<<#LINKS>>" << "\n";
    for (std::set<std::string>::iterator s_itr = _outgoing_link_set.begin();
         s_itr != _outgoing_link_set.end(); ++s_itr)
    {
        // The " $" suffix is part of the value stored in the link map too.
        const std::string link_entry = *s_itr + " $";
        ofs << link_entry;
        _link_set_map.insert(std::pair<std::string, std::string>(_article_title, link_entry));
    }

    ofs << "\n<<#Categories>>\n";
    for (std::set<std::string>::iterator cat = _categories.begin(); cat != _categories.end(); ++cat)
    {
        // First sighting of a category allocates the next category id.
        if (_category_map.find(*cat) == _category_map.end())
            _category_map[*cat] = ++_category_id;
        ofs << *cat << " $ ";
        doc._category_ids.push_back(_category_map[*cat]);
    }
    ofs.close();

    // Reset the per-document header state before the next document.
    _authors.clear();
    _categories.clear();
    _timestamp = "";

    // ---- Tokenize and hand the tokens to the document ----------------------
    std::vector<std::string> token_vec;
    const std::string delim = "\r\n\t ";
    CUtilities::tokenize(file_contents, token_vec, delim);

    for (std::vector<std::string>::iterator raw_token_it = token_vec.begin();
         raw_token_it != token_vec.end(); ++raw_token_it)
    {
        if (CDocument::_raw_token_id_map.find(*raw_token_it) == CDocument::_raw_token_id_map.end())
            CDocument::_raw_token_id_map[*raw_token_it] = CDocument::_raw_token_id++;
        doc.add_token(*raw_token_it);
    }

    // The processor is only needed for this call; release it afterwards
    // (it was previously leaked once per parsed document).
    CBasicTokenProcessor* cbtp = new CBasicTokenProcessor();
    doc.process_token_list(*cbtp, doc);
    delete cbtp;

    return status;
}