// Split `src` on any character contained in `sep`.
// Runs of consecutive separators yield no empty tokens.
Tokens StrSplit(const std::string &src, const std::string &sep) {
    Tokens result;
    std::string current;
    for (char ch : src) {
        // Non-separator characters accumulate into the current token.
        if (sep.find(ch) == std::string::npos) {
            current += ch;
            continue;
        }
        // Separator: flush the pending token, if any.
        if (!current.empty()) result.push_back(current);
        current.clear();
    }
    // Flush the final token (input may not end with a separator).
    if (!current.empty()) result.push_back(current);
    return result;
}
// Split `s` into runs of identical characters, with one quirk preserved
// from the original: a run that immediately follows a space is merged
// with that space into a single token (a boundary is only emitted when
// the previous character differs AND is not a space).
Tokens get_tokens( const String& s ) {
    Tokens out;
    String run = "";
    char last = ' ';
    for ( auto ch : s ) {
        // Emit the pending run when the character changes after a
        // non-space character.
        if ( last != ' ' && last != ch ) {
            out.push_back(run);
            run = "";
        }
        run += ch;
        last = ch;
    }
    // Emit whatever is left at end of input.
    if ( run != "" ) out.push_back(run);
    return out;
}
// Split `src` on any character contained in `sep`, dropping empty tokens
// (i.e. consecutive separators and leading/trailing separators produce
// nothing).
Tokens StrSplit(const std::string& src, const std::string& sep) {
    Tokens r;
    // Scan token-by-token: `begin` is the first non-separator character,
    // `end` the separator (or npos) that terminates the token.
    std::string::size_type begin = src.find_first_not_of(sep);
    while (begin != std::string::npos) {
        const std::string::size_type end = src.find_first_of(sep, begin);
        // substr clamps the count, so end == npos takes the whole tail.
        r.push_back(src.substr(begin, end - begin));
        begin = src.find_first_not_of(sep, end);
    }
    return r;
}
// Split `str` on any character in `delimiters`, with basic double-quote
// support: a token that starts with '"' extends to the next '"', and both
// quote characters are stripped from the emitted token (pos++ skips the
// opening quote; the closing quote position becomes the exclusive end).
// Empty tokens are never emitted. An unterminated quote consumes the rest
// of the string as one token.
// NOTE(review): escaped quotes inside a quoted token are not supported —
// the scan looks for the first raw '"' only.
StringUtils::Tokens StringUtils::tokenize(const std::string& str, const std::string& delimiters) {
    Tokens tokens;
    // pos: start of the current scan; delimPos/tokenPos: next delimiter /
    // next non-delimiter at or after pos.
    std::string::size_type delimPos = 0, tokenPos = 0, pos = 0;
    if(str.length()<1) return tokens;
    while(1) {
        delimPos = str.find_first_of(delimiters, pos);
        tokenPos = str.find_first_not_of(delimiters, pos);
        // Quoted token: re-target the end to the closing quote and step
        // pos past the opening quote so it is excluded from the token.
        if (tokenPos != std::string::npos && str[tokenPos]=='\"') {
            delimPos = str.find_first_of("\"", tokenPos+1);
            pos++;
        }
        if(std::string::npos != delimPos) {
            if(std::string::npos != tokenPos) {
                // Only emit when real token text precedes the delimiter
                // (tokenPos >= delimPos means pos sits on delimiters).
                if(tokenPos<delimPos) {
                    std::string token = str.substr(pos,delimPos-pos);
                    if (token.length()) tokens.push_back(token);
                }
            }
            // Resume after the delimiter (or after the closing quote).
            pos = delimPos+1;
        } else {
            // No further delimiter: the remainder is the last token.
            if(std::string::npos != tokenPos) {
                std::string token = str.substr(pos);
                if (token.length()) tokens.push_back(token);
            }
            break;
        }
    }
    return tokens;
}
// Tokenize `source` for the bytecode parser: split it into lines, split
// each line on spaces, append an explicit "\n" marker after each line's
// words so line boundaries survive, then map every string through
// makeToken().
//
// Fix: the original index loops compared a signed `int i` against the
// unsigned `size()` (implicit-conversion warning, overflow on huge
// inputs); range-for removes the index entirely.
//
// NOTE(review): assumes split() appends to its output vector without
// clearing it — the words vector is deliberately reused across lines.
// Confirm against split()'s definition.
Tokens Bytecode::lexer(const std::string& source) const {
    Strings lines;
    split(source, '\n', lines);

    Strings words;
    for (const auto& line : lines) {
        split(line, ' ', words);
        words.push_back("\n");  // line-end marker token
    }

    Tokens result;
    for (const auto& word : words) {
        result.push_back(makeToken(word));
    }
    return result;
}
int main() { Code code("//bla bli blo\nflap"); CodeRange cr(code); Tokens tokens; tokens.push_back(Token::Ptr(new Symbol(reduce(cr, 1)))); tokens.push_back(Token::Ptr(new Symbol(reduce(cr, 1)))); tokens.push_back(Token::Ptr(new Name(reduce(cr, 3)))); tokens.push_back(Token::Ptr(new Whitespace(reduce(cr, 1)))); tokens.push_back(Token::Ptr(new Name(reduce(cr, 3)))); tokens.push_back(Token::Ptr(new Whitespace(reduce(cr, 1)))); tokens.push_back(Token::Ptr(new Name(reduce(cr, 3)))); tokens.push_back(Token::Ptr(new Newline(reduce(cr, 1)))); tokens.push_back(Token::Ptr(new Name(reduce(cr, 4)))); TokenRange tr(tokens); auto comment = Comment::construct(tr); return 0; }
//static
// Parse one processor element of a pipeline expression from `tokens`,
// starting at `index` (advanced in place; `length` is the total token
// count). On success fills:
//   name  - processor identifier, or "" for an inline { ... } expression
//   tks   - the expression token, or the tokens between ( and )
//   range - newly heap-allocated Range when a '^' period follows
//           (caller takes ownership)
// Returns true when a '|' pipe token follows (another processor must be
// parsed), false when the pipeline ends here. THROWs on malformed input.
bool PipelineBuilder::extract(Tokens& tokens, int& index, int length, string& name, Tokens& tks, Range*& range) {
    // expect:
    // 1. name of a function
    // 2. name of a pre-defined pipeline
    // 3. expression in { ... }
    if (index >= length) THROW("Invalid pipeline expression: unfinished");
    if (tokens[index].m_angen == ANGEN_EXPRESSION) {
        // Inline expression: no name, the single expression token goes to tks.
        name = "";
        tks.push_back(tokens[index]);
        if (index >= length-1) return false;
        index++;
        if (tokens[index].m_angen == ANGEN_PIPE) {
            index++;
            return true;
        }
        return false;
    }
    if (tokens[index].m_angen != ANGEN_IDENT) THROW("Invalid pipeline expression: expected name");
    name = tokens[index].m_val.m_str;
    if (index >= length-1) return false;
    index++;
    // expect:
    // 1. ( function parameters
    // 2. | next processor in pipeline
    // 3. ^ range for a pre-defined pipeline
    if (tokens[index].m_angen == ANGEN_PIPE) {
        index++;
        return true;
    }
    if (tokens[index].m_angen == ANGEN_PAREN_OPEN) {
        if (index >= length-1) THROW("Invalid pipeline expression: missing parenthesis");
        index++;
        // Collect parameter tokens up to the matching close paren.
        // Nested parens are balanced via `inside`, but note the paren
        // tokens themselves are NOT copied into tks.
        int inside = 1;
        for ( ; (index < length) && (inside > 0); index++) {
            if (tokens[index].m_angen == ANGEN_PAREN_CLOSE) inside--;
            else if (tokens[index].m_angen == ANGEN_PAREN_OPEN) inside++;
            else tks.push_back(tokens[index]);
        }
        if (inside != 0) THROW("Expected closing parenthesis");
    }
    if (index >= length-1) return false;
    if (tokens[index].m_angen == ANGEN_PIPE) {
        index++;
        return true;
    }
    if (tokens[index].m_angen == ANGEN_RANGE) {
        index++;
        if (index >= length) THROW("Invalid pipeline expression: unfinished range");
        // Range value is either a positive integer (seconds) or a named
        // period parsed by parsePeriod().
        int period;
        PeriodType periodType = SECOND;
        if (tokens[index].m_angen == ANGEN_LONG) {
            if (tokens[index].m_val.m_long <= 0) THROW("Invalid period value");
            period = tokens[index].m_val.m_long;
        } else if (tokens[index].m_angen == ANGEN_TITLE) {
            parsePeriod(tokens[index].m_val.m_str.c_str(), period, periodType);
        } else {
            THROW("Expected range value");
        }
        range = new Range(period, periodType);
        index++;
    }
    if (index >= length-1) return false;
    if (tokens[index].m_angen == ANGEN_PIPE) {
        index++;
        return true;
    }
    return false;
}
int main(int argc, char const *argv[]) { // If there are not enough args, return -1 if (argc < 5) { std::cerr << "Usage: P7 <corpus> <sentence> <dictionary> <n> <threshold> <delta> <model>" << '\n'; return -1; } // Otherwise, collect the function parameters string corpusFileName = argv[1]; string sentenceFileName = argv[2]; string dictionaryFileName = argv[3]; unsigned int n = stoi(argv[4]); unsigned int threshold = stoi(argv[5]); double delta = stod(argv[6]); bool model = stoi(argv[7]); // Capture all tokens Tokens corpusTokens; Tokens sentenceTokens; Tokens dictionaryTokens; read_tokens(corpusFileName, corpusTokens, false); read_tokens(sentenceFileName, sentenceTokens, true); read_tokens(dictionaryFileName, dictionaryTokens, false); if (corpusTokens.size() < n) { std::cerr << "\nInput file '" << corpusFileName << "' is too small to create any nGrams of size " << n; return -1; } if (sentenceTokens.size() < n) { std::cerr << "\nInput file '" << sentenceFileName << "' is too small to create any nGrams of size " << n; return -1; } unordered_map <string, int> vocabulary; unordered_map <string, int> dictionary; vector<Corpus> corpus = getCorpusList(corpusTokens, n); for (auto &word : corpusTokens) { if (vocabulary.count(word) == 0) vocabulary[word] = 1; } for (auto &word : dictionaryTokens) { if (dictionary.count(word) == 0) dictionary[word] = 1; } vector<double> probs; int V = vocabulary.size() + 1; double N = corpusTokens.size(); // Collect sentences vector<Tokens> sentences; Tokens sentence; for (auto &word : sentenceTokens) { if (word == EOS) { sentences.push_back(sentence); sentence.clear(); } else { sentence.push_back(word); } } // Proof sentences for (auto &sentence : sentences) { std::cout << "Sentence:\t"; for (auto &word : sentence) std::cout << word << ' '; std::cout << '\n'; // Check against all words within reasonable distance vector<Tokens> candidateWords; for (auto &word : sentence) { Tokens candidates; for (auto &candidate : dictionary) if 
(uiLevenshteinDistance(word, candidate.first) <= 1) candidates.push_back(candidate.first); candidateWords.push_back(candidates); } // Check that the produced sentences from the candidate words makes semantic sense vector<Tokens> candidateSentences; // for (auto &words : candidateWords) { // for (auto &word : words) { // Tokens temp = sentence; // temp // candidateSentences.push_back(temp) // } // } for (int i = 0; i < candidateWords.size(); i++) { for (auto &word : candidateWords[i]) { Tokens temp = sentence; temp[i] = word; candidateSentences.push_back(temp); } } double bestProb = -DBL_MAX; Tokens bestSentence; for (auto &sentence : candidateSentences) { double prob = getProb(corpus, sentence, n, delta, N, V, threshold, model); if (prob > bestProb) { bestProb = prob; bestSentence = sentence; } } std::cout << "Suggestion:\t"; for (auto &word : bestSentence) std::cout << word << " "; std::cout << "\n"; } return 0; }