// opt_negated_query_word -> NOT query_word // | query_word static syntree *opt_negated_query_word (token **lookahead, err_context context) { syntree *left = NULL; RETURN_NULL_IF_OVER_DEPTH_LIMIT(context); if ( ft_match (lookahead, TOK_FT_NOT) ) { left = query_word (lookahead, context); if (left) { return (ft_not_new (left)); } else { gen_cond_err_msg ("Error: The NOT operator can only precede a word or phrase" " inside the CONTAINS condition string.", context.count); return (NULL); } } return (query_word (lookahead, context)); }
void CateTeller::tell(char ** p_text, int n_text, int * p_labels) { int n_word = N_WORD; std::vector< std::vector<char *> * > v_lines_words; //�����еĴ�����б� std::vector< std::vector<int> * > v_class_tf; //ÿ������ÿ���г��ִ�Ƶ���б� std::vector< std::vector<double> * > v_featureVector; //�������� cut(&seg, p_text, n_text, v_lines_words, n_word, filter); //��ÿ���ʣ��ڴʵ���Ѱ����Ӧ��Ƶ�� for(int iSen = 0; iSen < v_lines_words.size(); iSen++) { std::vector<int> * pSen_fp = new std::vector<int>; v_class_tf.push_back(pSen_fp); //std::cout << "querying no." << iSen+1 << "\n"; char message_t[50]; //sprintf(message_t, "\rquerying no.%d", iSen+1); sprintf(message_t, "\r�����%d��", iSen+1); // std::cout << message_t; for( int iWord = 0; iWord < (*v_lines_words.at(iSen)).size(); iWord++ ) //ÿ����10���� { char * pWord = (*v_lines_words.at(iSen)).at(iWord); //����ݿ������Ƶ���ѯ query_word(conn, pWord); if( v_query.size() != N_DIMEN ) //������������ { for(int i = 0; i < N_DIMEN; i++) { pSen_fp->push_back( 0 ); } } else //����û�г������� { for(int i = 0; i < N_DIMEN; i++) { pSen_fp->push_back( v_query.at(i) ); } } } } //������������������� double class_n[N_DIMEN] = {13186.0, 133915.0, 29844.0, 14694.0, 235245}; //�ʵ���ÿ��ĸ��� //std::vector< std::vector<int> * > v_class_tf; //ÿ������ÿ���г��ִ�Ƶ���б� for(int iSen = 0; iSen < v_class_tf.size(); iSen++) { std::vector<int> * fp_thisSen = v_class_tf.at(iSen); //����������� std::vector<double> * pFeatureV = new std::vector<double>; //�������������м���������� v_featureVector.push_back(pFeatureV); for(int iWord = 0; iWord < 10; iWord++) //ÿ�乲10���� { double norm_fp[N_DIMEN]; //�����һ����TF double max_nfp = 0.0; //���TF double sum_nfp = 0.0; //TF֮�� for(int i = 0; i < N_DIMEN; i++) { norm_fp[i] = 10000.0 * (double)(fp_thisSen->at( N_DIMEN*iWord + i ))/class_n[i]; sum_nfp += norm_fp[i]; if(max_nfp < norm_fp[i]) { max_nfp = norm_fp[i]; } } max_nfp /= 10.0; //�dz���10��sigmoid���� double f1 = 2.0/( 1.0 + exp(-0.10986*max_nfp) ) - 1.0; if(sum_nfp != 0) { pFeatureV->push_back( f1 ); for(int i = 0; i < N_DIMEN; i++) { pFeatureV->push_back( norm_fp[i]/sum_nfp ); } } else { pFeatureV->push_back( f1 ); for(int i = 0; i < N_DIMEN; i++) { pFeatureV->push_back( 0.0 ); } } } } ////svm������~ for(int iFV = 0; iFV < v_featureVector.size(); iFV++) //����ÿ����¼ { std::vector<double> * pFV = v_featureVector.at(iFV); struct svm_node * svmData = (struct svm_node *)malloc( (50+1)*sizeof(struct svm_node) ); for(int i = 0; i < 50; i++) { svmData[i].index = i+1; svmData[i].value = pFV->at(i); } svmData[50].index = -1; int label = svm_predict(svmModel, svmData); //l_labels.push_back(label); p_labels[iFV] = label; //std::cout << iFV+1 << " : " << label << "\n"; free(svmData); } //// do some cleanup //�ͷ�ȡ���б�v_lines_words for( int iSen = 0; iSen < v_lines_words.size(); iSen++ ) { std::vector<char *> * p_vSen = v_lines_words.at(iSen); for(int iWord = 0; iWord < p_vSen->size(); iWord++) { delete [] (char*)(p_vSen->at(iWord)); } p_vSen->clear(); } v_lines_words.clear(); //�ͷŲ�ѯ��Ƶ�б� for( int iSen = 0; iSen < v_class_tf.size(); iSen++ ) { v_class_tf.at(iSen)->clear(); } v_class_tf.clear(); //�ͷ������������� for( int iVec = 0; iVec < v_featureVector.size(); iVec++ ) { v_featureVector.at(iVec)->clear(); } v_featureVector.clear(); v_query.clear(); }
//void textCategorization(std::list<char *>& l_text, std::list<int>& l_labels) //����˵����p_text Ϊn_text���ַ�ָ�룻 p_labelsΪn_text����ǩ��int��ָ�� void textCategorization_new(char ** p_text, int n_text, int * p_labels, char * outputPath) { int n_word = N_WORD; std::vector< std::vector<char *> * > v_lines_words; //�����еĴ�����б� std::vector< std::vector<int> * > v_class_tf; //ÿ������ÿ���г��ִ�Ƶ���б� std::vector< std::vector<double> * > v_featureVector; //�������� ////���õĹ��� //�ָ���seg CppJieba::MPSegment seg; //��ʼ�� //bool init_res = seg.init("C:\\languageData_new\\jieba.dict.utf8"); bool init_res = seg.init("dependency/jieba.dict.utf8"); ////��ʼ����ѯ����sqlite3�� sqlite3 * conn = NULL; //������ݿ� // char * err_msg = NULL; //����ʧ�ܵ�ԭ�� //����ݿ⣬�������� //if( SQLITE_OK != sqlite3_open("C:\\languageData_new\\new_dictionary.sqlite", &conn) ) if( SQLITE_OK != sqlite3_open("dependency/new_dictionary.sqlite", &conn) ) { printf("can't open the database."); exit(-1); } //������ //struct svm_model * svmModel = svm_load_model("C:\\languageData_new\\trainingSet.txt.model"); struct svm_model * svmModel = svm_load_model("dependency/trainingSet.txt.model"); //��������� //termFilter filter("C:\\languageData_new\\symbelTerms.txt"); termFilter filter("dependency/symbelTerms.txt"); //cut( �ָ����� �����ļ��� ����ļ��� ���ֵĴ� ) //cut(&seg, "title_utf8.txt", "title_res_utf8.txt", n_word); //�ֵ�ִ� //cut(&seg, l_text, v_lines_words, n_word); cut(&seg, p_text, n_text, v_lines_words, n_word, filter); /* //����ִʽ�� std::ofstream ofile1; ofile1.open("split_res.txt", std::ios::out); //std::vector< std::vector<char *> * > v_lines_words; for(int iSen = 0; iSen < v_lines_words.size(); iSen++) { std::vector<char *> * pSen = v_lines_words.at(iSen); for(int iWord = 0; iWord < pSen->size(); iWord++) { ofile1 << pSen->at(iWord) << "\n"; } } ofile1.close(); */ //��ÿ���ʣ��ڴʵ���Ѱ����Ӧ��Ƶ�� for(int iSen = 0; iSen < v_lines_words.size(); iSen++) { std::vector<int> * pSen_fp = new std::vector<int>; v_class_tf.push_back(pSen_fp); //std::cout << "querying no." << iSen+1 << "\n"; char message_t[50]; //sprintf(message_t, "\rquerying no.%d", iSen+1); sprintf(message_t, "\r�����%d��", iSen+1); std::cout << message_t; for( int iWord = 0; iWord < (*v_lines_words.at(iSen)).size(); iWord++ ) //ÿ����10���� { char * pWord = (*v_lines_words.at(iSen)).at(iWord); //����ݿ������Ƶ���ѯ query_word(conn, pWord); if( v_query.size() != N_DIMEN ) //������������ { for(int i = 0; i < N_DIMEN; i++) { pSen_fp->push_back( 0 ); } } else //����û�г������� { for(int i = 0; i < N_DIMEN; i++) { pSen_fp->push_back( v_query.at(i) ); } } } } /* //���������� //std::vector< std::vector<int> * > v_class_tf; //ÿ������ÿ���г��ִ�Ƶ���б� std::ofstream ofile; ofile.open( "frequency.txt", std::ios::out ); for(int iSen = 0; iSen < v_class_tf.size(); iSen++) { std::vector<int> * pSen_fp = v_class_tf.at( iSen ); for( int iTF = 0; iTF < pSen_fp->size(); iTF++ ) { ofile << pSen_fp->at(iTF) << "\t"; } ofile << "\n"; } ofile.close(); */ //������������������� double class_n[N_DIMEN] = {13186.0, 133915.0, 29844.0, 14694.0, 235245}; //�ʵ���ÿ��ĸ��� //std::vector< std::vector<int> * > v_class_tf; //ÿ������ÿ���г��ִ�Ƶ���б� for(int iSen = 0; iSen < v_class_tf.size(); iSen++) { std::vector<int> * fp_thisSen = v_class_tf.at(iSen); //����������� std::vector<double> * pFeatureV = new std::vector<double>; //�������������м���������� v_featureVector.push_back(pFeatureV); for(int iWord = 0; iWord < 10; iWord++) //ÿ�乲10���� { double norm_fp[N_DIMEN]; //�����һ����TF double max_nfp = 0.0; //���TF double sum_nfp = 0.0; //TF֮�� for(int i = 0; i < N_DIMEN; i++) { norm_fp[i] = 10000.0 * (double)(fp_thisSen->at( N_DIMEN*iWord + i ))/class_n[i]; sum_nfp += norm_fp[i]; if(max_nfp < norm_fp[i]) { max_nfp = norm_fp[i]; } } max_nfp /= 10.0; //�dz���10��sigmoid���� double f1 = 2.0/( 1.0 + exp(-0.10986*max_nfp) ) - 1.0; if(sum_nfp != 0) { pFeatureV->push_back( f1 ); for(int i = 0; i < N_DIMEN; i++) { pFeatureV->push_back( norm_fp[i]/sum_nfp ); } } else { pFeatureV->push_back( f1 ); for(int i = 0; i < N_DIMEN; i++) { pFeatureV->push_back( 0.0 ); } } } } /* //����������� //std::vector< std::vector<double> * > v_featureVector; //�������� std::ofstream ofile2; ofile2.open("featureVector.txt", std::ios::app); for(int iVec = 0; iVec < v_featureVector.size(); iVec++) { std::vector<double> * v_Vector = v_featureVector.at(iVec); for(int iCell = 0; iCell < v_Vector->size(); iCell++) { ofile2 << iCell+1 << ":" << v_Vector->at(iCell) << " "; } ofile2 << "\n"; } ofile2.close(); */ ////svm������~ for(int iFV = 0; iFV < v_featureVector.size(); iFV++) //����ÿ����¼ { std::vector<double> * pFV = v_featureVector.at(iFV); struct svm_node * svmData = (struct svm_node *)malloc( (50+1)*sizeof(struct svm_node) ); for(int i = 0; i < 50; i++) { svmData[i].index = i+1; svmData[i].value = pFV->at(i); } svmData[50].index = -1; int label = svm_predict(svmModel, svmData); //l_labels.push_back(label); p_labels[iFV] = label; //std::cout << iFV+1 << " : " << label << "\n"; free(svmData); } //���Ԥ����� if( outputPath != NULL ) { std::ofstream ofile3; ofile3.open( outputPath, std::ios::out ); for(int i = 0; i < n_text; i++) { ofile3 << p_labels[i] << "\n"; } ofile3.close(); } //���� /////�ͷŷִ��� seg.dispose(); //�ر�sqlite3���� if( SQLITE_OK != sqlite3_close(conn) ) { printf("can't close the database: %s/n", sqlite3_errmsg(conn)); exit(-1); } //�ͷŷ����� free(svmModel); //�ͷ�ȡ���б�v_lines_words for( int iSen = 0; iSen < v_lines_words.size(); iSen++ ) { std::vector<char *> * p_vSen = v_lines_words.at(iSen); for(int iWord = 0; iWord < p_vSen->size(); iWord++) { delete [] (char*)(p_vSen->at(iWord)); } p_vSen->clear(); } v_lines_words.clear(); //�ͷŲ�ѯ��Ƶ�б� for( int iSen = 0; iSen < v_class_tf.size(); iSen++ ) { v_class_tf.at(iSen)->clear(); } v_class_tf.clear(); //�ͷ������������� for( int iVec = 0; iVec < v_featureVector.size(); iVec++ ) { v_featureVector.at(iVec)->clear(); } v_featureVector.clear(); v_query.clear(); //return p_labels; }