void loadPattern(const string &filename) { FILE* in = tryOpen(filename, "r"); for (;getLine(in);) { vector<string> tokens = splitBy(line, ','); string pattern = tolower(tokens[0]); int occurrence; fromString(tokens[1], occurrence); patterns[pattern] = occurrence; prob[pattern] = occurrence; quote[pattern] = parenthesis[pattern] = dash[pattern] = capital[pattern] = total[pattern] = 0; size_t tokensN = splitBy(pattern, ' ').size(); f[pattern].resize(tokensN, 0); sumOutside[pattern].resize(tokensN, 0); tree.add(" " + pattern + " "); } fclose(in); //cerr << "# Pattern = " << prob.size() << endl; tree.make(); //cerr << "Tree is built" << endl; }
/**
 * Parses a tokenized SELECT statement and evaluates it.
 *
 * Token 0 is skipped; an optional DISTINCT may follow. Tokens up to FROM form
 * the select list, tokens up to WHERE/ORDER form the from list, tokens after
 * WHERE (up to ORDER) form the where list, and the single token after
 * "ORDER BY" is the ordering column. Only the part of a select/from token
 * that precedes its first comma is kept. The three column lists are passed
 * through preProcess() together with the from list, then handed to
 * generateLQP(). The resulting relation is printed to stdout when it is
 * non-null.
 *
 * All token accesses are bounds-checked, so a truncated statement (e.g. one
 * ending in "ORDER BY") no longer reads past the end of `words`.
 *
 * @param words          statement tokens.
 * @param schema_manager schema manager forwarded to preProcess()/generateLQP().
 * @param mem            main memory forwarded to generateLQP().
 * @return the relation returned by generateLQP().
 */
Relation* Select(vector<string> &words, SchemaManager &schema_manager, MainMemory &mem){
    vector<string> select_list, from_list, where_list, order_list;
    bool has_distinct = false;
    size_t i = 1;

    if (i < words.size() && words[i] == "DISTINCT"){
        has_distinct = true;
        i++;
    }

    while (i < words.size() && words[i] != "FROM"){
        // drop comma; a token holding nothing but separators yields no column
        vector<string> parts = splitBy(words[i], ",");
        if (!parts.empty()){
            select_list.push_back(parts[0]);
        }
        i++;
    }
    i++; // skip FROM

    while (i < words.size() && words[i] != "WHERE" && words[i] != "ORDER"){
        vector<string> parts = splitBy(words[i], ",");
        if (!parts.empty()){
            from_list.push_back(parts[0]);
        }
        i++;
    }

    if (i < words.size() && words[i] == "WHERE"){
        i++; // skip WHERE
        while (i < words.size() && words[i] != "ORDER"){
            where_list.push_back(words[i]);
            i++;
        }
    }

    if (i < words.size() && words[i] == "ORDER"){
        i += 2; // skip ORDER BY
        if (i < words.size()){
            order_list.push_back(words[i]);
        }
    }

    // add table name to each column name
    preProcess(from_list, select_list, schema_manager);
    preProcess(from_list, where_list, schema_manager);
    preProcess(from_list, order_list, schema_manager);

    Relation* view = generateLQP(has_distinct, select_list, from_list, where_list, order_list, schema_manager, mem);
    if (view){
        cout<<*view<<endl;
    }
    return view;
}
/**
 * Scores the best way to cut `pattern` into shorter phrases known to `prob`.
 *
 * The pattern is split on spaces. best[k] holds the highest score of any
 * segmentation of the first k tokens, where a segmentation's score is the
 * product over its segments of prob[segment] * penalty. The segment that
 * spans the entire pattern is never used, so the pattern has to be explained
 * by strictly shorter pieces.
 *
 * @param pattern space-separated phrase to segment.
 * @param prob    phrase -> score table; only phrases present in it may be
 *                used as segments.
 * @return the best score for the whole pattern (0 when no segmentation
 *         exists, 1 for a pattern without tokens).
 */
double segmentationByDP(const string &pattern, MAP_S_D &prob)
{
    const vector<string> words = splitBy(pattern, ' ');
    const int n = (int)words.size();

    vector<double> best(n + 1, 0);
    best[0] = 1;

    for (int start = 0; start < n; ++ start) {
        string segment;
        for (int end = start; end < n; ++ end) {
            if (!segment.empty()) {
                segment += " ";
            }
            segment += words[end];

            const bool coversWholePattern = (end - start + 1 == n);
            if (coversWholePattern || !prob.count(segment)) {
                continue;
            }

            const double candidate = best[start] * prob[segment] * penalty;
            if (candidate > best[end + 1]) {
                best[end + 1] = candidate;
            }
        }
    }
    return best[n];
}
/**
 * Builds the stop-word / IDF feature rows for every multi-word pattern.
 *
 * The first returned element is the CSV header
 * "avg_idf,stop_ratio,first_stop,last_stop". One row follows for each key of
 * `patterns` that contains a space:
 *   - avg_idf:    mean of word2idf over the tokens that have an entry
 *                 (0 when none has one),
 *   - stop_ratio: fraction of tokens present in `stopwords`,
 *   - first_stop / last_stop: 1 when the first / last token is in
 *                 `stopwords`, otherwise 0.
 *
 * The stop-word flags are converted to int before formatting (count()
 * yields a size_t, which does not match "%d"), and a pattern that splits
 * into no tokens produces an all-zero row instead of indexing an empty
 * vector.
 *
 * @param patterns pattern table; only its keys are read.
 * @return header line followed by one CSV row per multi-word pattern.
 */
vector<string> printStopwords(MAP_S_D &patterns)
{
    vector<string> ret(1, "avg_idf,stop_ratio,first_stop,last_stop");
    char temp[1000];
    for (const auto &entry : patterns) {
        const string &pattern = entry.first;
        if (pattern.find(' ') == string::npos) {
            continue; // single-word patterns get no row
        }
        vector<string> tokens = splitBy(pattern, ' ');

        double sum = 0;
        int cnt = 0, stopCnt = 0;
        for (const string &token : tokens) {
            if (word2idf.count(token)) {
                ++ cnt;
                sum += word2idf[token];
            }
            if (stopwords.count(token)) {
                ++ stopCnt;
            }
        }
        if (cnt > 0) {
            sum /= cnt;
        }

        const bool hasTokens = !tokens.empty();
        const double stopRatio = hasTokens ? (double)stopCnt / tokens.size() : 0.0;
        const int firstStop = hasTokens ? (int)stopwords.count(tokens.front()) : 0;
        const int lastStop = hasTokens ? (int)stopwords.count(tokens.back()) : 0;

        snprintf(temp, sizeof(temp), "%.10f,%.10f,%d,%d", sum, stopRatio, firstStop, lastStop);
        ret.push_back(temp);
    }
    return ret;
}
void loadProb(const string &prefix) { for (int length = 1; length <= 6; ++ length) { char filename[255]; sprintf(filename, "%s%d.csv", prefix.c_str(), length); FILE* in = tryOpen(filename, "r"); if (in == NULL) { cerr << "[Warning] No length " << length << " phrases." << endl; continue; } getLine(in); double sum = 0; for (;getLine(in);) { vector<string> tokens = splitBy(line, ','); string pattern = tolower(tokens[0]); double value; fromString(tokens[2], value); prob[pattern] = value; sum += value; } //fprintf(stderr, "sum %d = %.6f\n", length, sum); fclose(in); } //cerr << "# prob = " << prob.size() << endl; }
/*
 * joinPaths
 * Combines relPath with basePath and stores the cleaned-up result back into
 * relPath. Works in three passes:
 *   1. basePath is copied and trimmed with cutLast(..., 1), then relPath is
 *      appended (with a "/" in between unless cutLast reported a trailing
 *      slash),
 *   2. the result is re-split on `slashes` and every segment for which
 *      segP(segment, 1) holds is dropped,
 *   3. the result is re-split again and segments for which segP(segment, 2)
 *      holds cancel the previously emitted segment while depth > 1.
 * NOTE(review): per the inline comments segP(.., 1) / segP(.., 2) presumably
 * recognise "." and ".." - confirm against segP's definition.
 */
void joinPaths(Str& relPath, const Str& basePath)
{
    Str segment;
    DStr absPath;

    // append the relPath to all-but-the-last-segment-of-basePath
    // (endSlash is whatever cutLast returns; lastSeg is set in the loop below)
    Bool endSlash = cutLast(absPath = basePath, 1), lastSeg;
    DStr result = absPath + (endSlash? "" : "/") + relPath;

    // throw out all '.' from the path
    const char *p = (const char*) result;
    absPath.empty();  // absPath is reused as the output of this pass
    while(splitBy(p, slashes, segment))
    {
        if (!segP(segment, 1))
            absPath += segment + "/";
    }
    // the segment left over after the final split gets no trailing "/"
    if (!segP(segment, 1))
        absPath += segment;

    // throw out all "something/.." from the path
    p = (char*) absPath;
    int depth = 0;  // number of ordinary segments appended to result so far
    result.empty();  // result is reused as the output of this pass
    do
    {
        // splitBy yielding a false value marks the last segment
        lastSeg = (Bool) !splitBy(p, slashes, segment);
        if (!segP(segment, 2))
        {
            result += segment + (lastSeg ? "" : "/");
            depth++;
        }
        else
        {
            if (depth > 1)
            {
                // drop what was emitted last; NOTE(review): exact amount
                // removed depends on cutLast's second argument - confirm
                cutLast(result, 2);
                depth--;
            }
            else
                // nothing left to cancel: keep the segment as it is
                result += segment + (lastSeg ? "" : "/");
        };
    }
    while(!lastSeg);

    relPath = result;
}
vector<string> printStat(MAP_S_D &patterns, MAP_S_D &prob, unordered_map<string, vector<double> > &f, MAP_S_D &total, unordered_map<string, vector<double> > &sumOutside) { vector<string> ret(1, "prob_feature,occur_feature,log_occur_feature,prob_log_occur,constant,outsideSentence,new_outside,frequency"); char temp[1000]; FOR (iter, patterns) { const string &pattern = iter->first; if (pattern.find(' ') == string::npos) { continue; } string AB = ""; string CD = ""; double best = -1; for (size_t i = 0; i < pattern.size(); ++ i) { if (pattern[i] == ' ') { string left = pattern.substr(0, i); string right = pattern.substr(i + 1); double current = prob[left] * prob[right]; if (current > best) { best = current; AB = left; CD = right; } } } double f1 = prob[pattern] / (prob[AB] * prob[CD]); double f2 = iter->second / sqrt(patterns[AB] * patterns[CD]); double f3 = sqrt(iter->second) * log(f1); double f4 = prob[pattern] * log(f1); vector<string> tokens = splitBy(pattern, ' '); double sum = 0, norm = 0; for (size_t i = 0; i < tokens.size(); ++ i) { if (total[pattern] > 0) { f[pattern][i] /= total[pattern]; } double wi = log(prob[tokens[i]]); sum += sqr(f[pattern][i]) * sqr(wi); norm += sqr(wi); } double outside = sqrt(sum / norm); sum = norm = 0; for (size_t i = 0; i < tokens.size(); ++ i) { sum += sumOutside[pattern][i] * word2idf[tokens[i]]; norm += word2idf[tokens[i]]; } if (total[pattern] > 0) { sum /= total[pattern]; } double newOutside = sum / norm; sprintf(temp, "%.10f,%.10f,%.10f,%.10f,1,%.10f,%.10f,%.0f", f1, f2, f3, f4, outside, newOutside, iter->second); ret.push_back(temp); } return ret; }
/*
 * uri2SchemePath
 * Splits an absolute URI at its first ":" and classifies the scheme.
 *   S        - situation object forwarded to schemeToURI_()
 *   absolute - the URI text; splitBy() is called on it with ":" as the
 *              delimiter set and the piece it yields is stored in `scheme`
 *   scheme   - receives the scheme part
 *   rest     - receives whatever `absolute` points at after the split
 *              (NOTE(review): presumably splitBy advances the pointer past
 *              the ":" - confirm against splitBy's signature)
 * The split is asserted (sabassert) to have found a delimiter. A leading
 * "//" in the remainder is deliberately left in place; code that used to
 * strip it was disabled in the original.
 * Returns the value of schemeToURI_(S, scheme).
 */
URIScheme uri2SchemePath(Sit S, const char *absolute, Str& scheme, Str& rest)
{
    Bool hasScheme = (Bool) !!splitBy(absolute, ":", scheme);
    sabassert(hasScheme);

    rest = (char*) absolute;

    return schemeToURI_(S, scheme);
}
Data(string line, bool isTrain) { vector<string> tokens = splitBy(line, ','); if (isTrain) { values = tokens; } else { for (int i = 0; i < tokens.size(); ++ i) { values.push_back(tokens[i]); if (i == 0) { values.push_back("0"); // fake a value for IsBadBuy for testing data } } } }
vector<string> loadFeatureTable(const string &filename) { FILE* in = tryOpen(filename, "r"); vector<string> ret; getLine(in); ret.push_back(line); for (;getLine(in);) { vector<string> tokens = splitBy(line, ','); string pattern = tolower(tokens[0]); patterns[pattern] = line; } FOR (iter, patterns) { ret.push_back(iter->second); }
/**
 * Loads pattern occurrence counts from a CSV file into `oldProb`.
 *
 * Every usable line must contain at least two comma-separated fields: the
 * pattern text (lower-cased before use) and an integer occurrence count.
 * Lines with fewer than two fields are reported on stderr and skipped rather
 * than indexing past the end of the token vector.
 *
 * @param filename path of the CSV file to read.
 */
void loadPattern(const string &filename)
{
    FILE* in = tryOpen(filename, "r");
    // NOTE(review): assumes tryOpen() yields NULL when the file cannot be
    // opened; nothing is loaded in that case.
    if (in == NULL) {
        return;
    }
    for (;getLine(in);) {
        vector<string> tokens = splitBy(line, ',');
        if (tokens.size() < 2) {
            cerr << "[Warning] malformed pattern line: " << line << endl;
            continue;
        }
        string pattern = tolower(tokens[0]);
        int occurrence = 0; // stays 0 if fromString() leaves it untouched
        fromString(tokens[1], occurrence);
        oldProb[pattern] = occurrence;
    }
    fclose(in);
}
/*
 * splitURI
 * Breaks a URI into up to five pieces stored in parts[U_SCHEME],
 * parts[U_AUTH], parts[U_PATH], parts[U_QUERY] and parts[U_FRAG].
 * All five entries are emptied first, so pieces that are not present stay
 * empty. `rest` walks through the URI as each piece is consumed and `c`
 * remembers the value splitBy() returned for the previous piece.
 * NOTE(review): RF(x) presumably returns from the function when x is false,
 * and splitBy() presumably returns the delimiter that ended the piece (0 at
 * the end of the string) while advancing `rest` - confirm both.
 */
void splitURI(const char *uri, FiveStr &parts)
{
    const char *rest;
    char c;
    for (int i = 0; i < 5; i++)
        parts[i].empty();
    // nothing to do for a null or empty URI
    RF( uri && *uri );
    // extract the scheme part of the URI
    if (!splitBy(rest = uri, ":", parts[U_SCHEME]))
        // a false result is treated as "no scheme"; clear anything stored
        parts[U_SCHEME].empty();
    // if "//" follows, extract the authority part
    c = 'A'; // marks the absence of auth
    if (isSlash(*rest) && isSlash(rest[1]))
        // slashes"?#" relies on string-literal concatenation, so `slashes`
        // must expand to a string literal
        RF( c = splitBy(rest += 2, slashes"?#", parts[U_AUTH]) );
    if (isSlash(c) || c == 'A')
        // extract the path
        // when the authority ended at a slash, step back by isSlash(c) so
        // that the slash is part of the path
        RF( c = splitBy(rest -= (isSlash(c)), "?#", parts[U_PATH]) );
    //query and fragment
    if (c == '?')
        // extract the query
        RF( c = splitBy(rest, "#", parts[U_QUERY]) );
    // copy the fragment
    parts[U_FRAG] = (char *) rest;
};
/**
 * Handles a "set" line: splits the current line on spaces (0x20) into at
 * most three parts and stores part 2 (unescaped) into the tuple field named
 * by part 1.
 *
 * Does nothing when `status` already holds a failure. When fewer than three
 * parts are found an error is shown and `status` becomes U_PARSE_ERROR. When
 * the tuple refuses the value an error is shown.
 *
 * @param status ICU error code, in/out.
 */
void DataDrivenNumberFormatTestSuite::setTupleField(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeString pieces[3];
    const int32_t pieceCount = splitBy(pieces, UPRV_LENGTHOF(pieces), 0x20);

    if (pieceCount >= 3) {
        // pieces[1] is the field name, pieces[2] its value
        if (!fTuple.setField(
                NumberFormatTestTuple::getFieldByName(pieces[1]),
                pieces[2].unescape(),
                status)) {
            showError("Invalid field value");
        }
    } else {
        showError("Set expects 2 parameters");
        status = U_PARSE_ERROR;
    }
}
/**
 * Loads the IDF table and the stop-word list.
 *
 * `idfFile` is read first: every line that splits into exactly two
 * comma-separated fields stores word -> idf in `word2idf`; any other line is
 * echoed to stderr. `stopFile` is read next: every line becomes a key of
 * `stopwords` with value 1.
 *
 * Both files are now closed after reading (the original leaked both
 * handles), and a file that could not be opened is skipped.
 *
 * @param stopFile path of the stop-word list, one word per line.
 * @param idfFile  path of the "word,idf" CSV file.
 */
void loadStopwords(string stopFile, string idfFile)
{
    FILE* in = tryOpen(idfFile, "r");
    // NOTE(review): assumes tryOpen() yields NULL when the file cannot be opened
    if (in != NULL) {
        for (;getLine(in);) {
            vector<string> tokens = splitBy(line, ',');
            if (tokens.size() == 2) {
                string word = tokens[0];
                double idf = 0; // stays 0 if fromString() leaves it untouched
                fromString(tokens[1], idf);
                word2idf[word] = idf;
            } else {
                cerr << line << endl;
            }
        }
        fclose(in);
    }

    in = tryOpen(stopFile, "r");
    if (in != NULL) {
        for (;getLine(in);) {
            stopwords[line] = 1;
        }
        fclose(in);
    }
}
void DataDrivenNumberFormatTestSuite::run(const char *fileName, UBool runAllTests) { fFileLineNumber = 0; fFormatTestNumber = 0; UErrorCode status = U_ZERO_ERROR; for (int32_t i = 0; i < UPRV_LENGTHOF(fPreviousFormatters); ++i) { delete fPreviousFormatters[i]; fPreviousFormatters[i] = newFormatter(status); } if (!assertSuccess("Can't create previous formatters", status)) { return; } CharString path(getSourceTestData(status), status); path.appendPathPart(fileName, status); const char *codePage = "UTF-8"; LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status)); if (!assertSuccess("Can't open data file", status)) { return; } UnicodeString columnValues[kNumberFormatTestTupleFieldCount]; ENumberFormatTestTupleField columnTypes[kNumberFormatTestTupleFieldCount]; int32_t columnCount; int32_t state = 0; while(U_SUCCESS(status)) { // Read a new line if necessary. if(fFileLine.isEmpty()) { if(!readLine(f.getAlias(), status)) { break; } if (fFileLine.isEmpty() && state == 2) { state = 0; } continue; } if (fFileLine.startsWith("//")) { fFileLine.remove(); continue; } // Initial setup of test. 
if (state == 0) { if (fFileLine.startsWith(UNICODE_STRING("test ", 5))) { fFileTestName = fFileLine; fTuple.clear(); } else if(fFileLine.startsWith(UNICODE_STRING("set ", 4))) { setTupleField(status); } else if(fFileLine.startsWith(UNICODE_STRING("begin", 5))) { state = 1; } else { showError("Unrecognized verb."); return; } // column specification } else if (state == 1) { columnCount = splitBy(columnValues, UPRV_LENGTHOF(columnValues), 0x9); for (int32_t i = 0; i < columnCount; ++i) { columnTypes[i] = NumberFormatTestTuple::getFieldByName( columnValues[i]); if (columnTypes[i] == kNumberFormatTestTupleFieldCount) { showError("Unrecognized field name."); return; } } state = 2; // run the tests } else { int32_t columnsInThisRow = splitBy(columnValues, columnCount, 0x9); for (int32_t i = 0; i < columnsInThisRow; ++i) { fTuple.setField( columnTypes[i], columnValues[i].unescape(), status); } for (int32_t i = columnsInThisRow; i < columnCount; ++i) { fTuple.clearField(columnTypes[i], status); } if (U_FAILURE(status)) { showError("Invalid column values"); return; } if (!breaksC() || runAllTests) { UnicodeString errorMessage; if (!isPass(fTuple, errorMessage, status)) { showFailure(errorMessage); } } } fFileLine.remove(); } }
/**
 * Executes an INSERT statement against the relation named by words[2].
 *
 * Two forms are handled:
 *   - plain values: `line` is split on "(" and ")"; piece 1 holds the field
 *     list and piece 3 the value list. Each value is stored in the column
 *     whose bare name equals the corresponding field name (INT columns via
 *     atoi), and the tuple is appended to the relation.
 *   - INSERT ... SELECT: the tokens from "SELECT" onwards are evaluated with
 *     Select(); all tuples of the result are read into memory first (the
 *     source and the target may be the same relation), then copied column
 *     by column into the target wherever the bare field names match. The
 *     target relation is printed to stdout afterwards.
 *
 * Malformed input that previously indexed out of range (too few tokens, an
 * unknown relation, missing parenthesised lists, or a field/value count
 * mismatch in builds where assert is disabled) is now reported on stderr
 * and NULL is returned. The "no tuples" warning now tests the block that was
 * just read rather than the accumulated tuple list.
 *
 * @param words          statement tokens; words[2] is the relation name.
 * @param line           the raw statement text.
 * @param schema_manager schema manager used to resolve the relation.
 * @param mem            main memory used for block transfers.
 * @return the target relation, or NULL when the statement is rejected.
 */
Relation* Insert(vector<string> &words, string &line, SchemaManager &schema_manager, MainMemory &mem){
    if (words.size() < 3){
        cerr<<"Error: Insert, statement has no relation name"<<endl;
        return NULL;
    }
    Relation* relation_ptr = schema_manager.getRelation(words[2]);
    if (!relation_ptr){
        cerr<<"Error: Insert, relation "<<words[2]<<" not found"<<endl;
        return NULL;
    }

    vector<string>::iterator it = find(words.begin(), words.end(), "SELECT");
    // no select
    if (it == words.end()){
        // get insert vals
        vector<string> content = splitBy(line, "()");
        if (content.size() < 4){
            cerr<<"Error: Insert, expected (fields) VALUES (values)"<<endl;
            return NULL;
        }
        vector<string> fields = splitBy(content[1], ", ");
        vector<string> vals = splitBy(content[3], ",");
        preProcess(vector<string>(1, words[2]), vals, schema_manager);

        assert(fields.size() == vals.size());
        if (fields.size() != vals.size()){
            // same condition as the assert, kept for builds without asserts
            cerr<<"Error: Insert, number of fields and values differ"<<endl;
            return NULL;
        }

        Tuple tuple = relation_ptr->createTuple();

        // standard insert doesn't have table names
        vector<string> col_names = nakedFieldNames(relation_ptr);

        // comparing
        for (size_t i = 0; i < fields.size(); i++){
            for (size_t j = 0; j < col_names.size(); j++){
                // this is a match
                if (fields[i] == col_names[j]){
                    if (tuple.getSchema().getFieldType(j) == INT){
                        tuple.setField(j, atoi(vals[i].c_str()));
                    }
                    else{
                        tuple.setField(j, vals[i]);
                    }
                    break;
                }
            }
        }
        appendTupleToRelation(relation_ptr, mem, tuple);
    }
    // with SELECT
    else{
        vector<string> SFW(it, words.end());
        Relation* new_relation = Select(SFW, schema_manager, mem);
        assert(new_relation);

        vector<string> new_field_names = nakedFieldNames(new_relation);
        vector<string> field_names = nakedFieldNames(relation_ptr);

        // mapping: index of new_field_names to field_names
        vector<int> mapping(new_field_names.size(), -1);
        for (size_t i = 0; i < new_field_names.size(); i++){
            for (size_t j = 0; j < field_names.size(); j++){
                if (new_field_names[i] == field_names[j]){
                    mapping[i] = j;
                    break;
                }
            }
        }

        int new_field_size = new_relation->getSchema().getNumOfFields();

        // warning: new_relation and relation_ptr might be the same!
        // get all tuples from the new_relation in one run
        vector<Tuple> new_tuples;
        for (int i = 0; i < new_relation->getNumOfBlocks(); i++){
            assert(!free_blocks.empty());
            int memory_block_index = free_blocks.front();
            free_blocks.pop();

            // read the relation block by block
            new_relation->getBlock(i, memory_block_index);
            Block* block_ptr = mem.getBlock(memory_block_index);
            assert(block_ptr);

            vector<Tuple> block_tuples = block_ptr->getTuples();
            if (block_tuples.empty()){
                cerr<<"Warning: Insert from SFW, No tuples in the current mem block!"<<endl;
            }
            new_tuples.insert(new_tuples.end(), block_tuples.begin(), block_tuples.end());

            // hand the memory block back once its tuples have been copied
            free_blocks.push(memory_block_index);
        }

        for (size_t j = 0; j < new_tuples.size(); j++){
            Tuple tuple = relation_ptr->createTuple();
            for (int k = 0; k < new_field_size; k++){
                if (mapping[k] != -1){
                    int idx = mapping[k];
                    assert(idx < relation_ptr->getSchema().getNumOfFields() && idx >= 0);
                    if (tuple.getSchema().getFieldType(idx) == INT){
                        int val = new_tuples[j].getField(k).integer;
                        tuple.setField(field_names[idx], val);
                    }
                    else{
                        string *str = new_tuples[j].getField(k).str;
                        tuple.setField(field_names[idx], *str);
                    }
                }
            }
            appendTupleToRelation(relation_ptr, mem, tuple);
        }
        cout<<*relation_ptr<<endl;
    }
    return relation_ptr;
}
int main(int argc, char* argv[]) { if (argc != 6) { cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl; return -1; } selftest(); loadSentences(argv[1]); loadPattern(argv[2]); loadStopwords(argv[3], argv[4]); int corpusTokensN = 0; for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) { const string &text = sentences[sentenceID]; string alpha = text; for (size_t i = 0; i < alpha.size(); ++ i) { if (isalpha(alpha[i])) { alpha[i] = tolower(alpha[i]); } else { if (alpha[i] != '\'') { alpha[i] = ' '; } } } corpusTokensN += splitBy(alpha, ' ').size(); string outsideText = alpha; if (sentenceID > 0) { outsideText += " " + sentences[sentenceID - 1]; } if (sentenceID + 1 < sentences.size()) { outsideText += " " + sentences[sentenceID + 1]; } for (size_t i = 0; i < outsideText.size(); ++ i) { if (isalpha(outsideText[i])) { outsideText[i] = tolower(outsideText[i]); } else { outsideText[i] = ' '; } } vector<string> outside = splitBy(outsideText, ' '); unordered_map<string, int> outsideCnt; FOR (token, outside) { ++ outsideCnt[*token]; } vector< pair<int, int> > positions; tree.search(" " + alpha + " ", positions); unordered_map<string, int> patternCnt; FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); ++ patternCnt[pattern]; } FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); vector<string> tokens = splitBy(pattern, ' '); unordered_map<string, int> tokenCnt; int delta = patternCnt[pattern]; for (size_t i = 0; i < tokens.size(); ++ i) { tokenCnt[tokens[i]] += delta; } for (size_t i = 0; i < tokens.size(); ++ i) { if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) { f[pattern][i] += 1; sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]]; } } total[pattern] += 1; if (st > 0 && ed < (int)text.size()) { if (text[st - 1] == '(' && text[ed] == ')') { 
parenthesis[pattern] += 1; } if (text[st - 1] == '"' && text[ed] == '"') { quote[pattern] += 1; } } bool found = false; for (int i = st; i < ed && !found; ++ i) { found |= text[i] == '-'; } dash[pattern] += found; bool valid = true; for (int i = st; i < ed && valid; ++ i) { if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) { if (text[i] < 'A' && text[i] > 'Z') { valid = false; } } } capital[pattern] += valid; }