void Misc::limpiarLabeled(const std::string& input, const std::string& output) { std::ifstream labeled(input.c_str()); std::ofstream cleanLabeled(output.c_str()); std::string header, line; std::vector<std::string> lineVec; int bytesLeidos = 0; int bytesTotales = 33556378; // Hardcodeadisimo std::cout << "LIMPIANDO LABELED...\n"; // Header: labeled >> header >> header >> header; labeled.ignore(1,'\n'); while (labeled.peek() != EOF) { std::getline(labeled, line); lineVec = split(line, '\t'); processText(lineVec[2]); removeStopwords(lineVec[2]); cleanLabeled << lineVec[0] + '\t' << lineVec[1] << '\t' << lineVec[2]; labeled.ignore(1, '\n'); bytesLeidos += line.size() + 1; std::cout << "\r" << (int)(bytesLeidos*100.0/bytesTotales) << "%"; if (labeled.peek() != EOF) cleanLabeled << '\n'; } std::cout << "\r100%\n"; }
// virtual void GNeighborTransducer::transduce(GData* pDataLabeled, GData* pDataUnlabeled, int labelDims) { if(labelDims != 1) ThrowError("Only 1 nominal label is supported"); if(!pDataLabeled->relation()->areNominal(pDataLabeled->relation()->size() - 1, 1)) ThrowError("Only nominal labels are supported"); if(!pDataLabeled->relation()->areContinuous(0, pDataLabeled->relation()->size() - 1)) ThrowError("Only continuous features are supported"); if(pDataLabeled->cols() != pDataUnlabeled->cols()) ThrowError("relations don't match"); // Make a dataset containing all rows GData dataAll(pDataLabeled->relation()); dataAll.reserve(pDataLabeled->rows() + pDataUnlabeled->rows()); GReleaseDataHolder hDataAll(&dataAll); for(size_t i = 0; i < pDataUnlabeled->rows(); i++) dataAll.takeRow(pDataUnlabeled->row(i)); for(size_t i = 0; i < pDataLabeled->rows(); i++) dataAll.takeRow(pDataLabeled->row(i)); int featureDims = pDataLabeled->cols() - labelDims; sp_relation pRelInputs = new GUniformRelation(featureDims, 0); dataAll.setRelation(pRelInputs); // Find friends GNeighborFinder* pNF; if(m_intrinsicDims == 0) pNF = new GNeighborFinderCacheWrapper(new GKdTree(&dataAll, 0, m_friendCount, NULL, true), true); else pNF = new GManifoldNeighborFinder( &dataAll, m_friendCount, // littleK m_friendCount * 4, // bigK m_intrinsicDims, // intrinsicDims m_alpha, // alpha m_beta, // beta false, // prune? m_pRand); Holder<GNeighborFinder> hNF(pNF); GTEMPBUF(size_t, neighbors, m_friendCount); int labelValues = pDataLabeled->relation()->valueCount(featureDims); GTEMPBUF(double, tallys, labelValues); // Label the unlabeled patterns GBitTable labeled(pDataUnlabeled->rows()); GData labelList(3); // pattern index, most likely label, confidence labelList.newRows(pDataUnlabeled->rows()); for(size_t i = 0; i < pDataUnlabeled->rows(); i++) labelList.row(i)[0] = i; while(labelList.rows() > 0) { // Compute the most likely label and the confidence for each pattern for(size_t i = 0; i < labelList.rows(); i++) { // Find the most common label double* pRow = labelList.row(i); size_t index = (size_t)pRow[0]; pNF->neighbors(neighbors, index); GVec::setAll(tallys, 0.0, labelValues); for(int j = 0; j < m_friendCount; j++) { if(neighbors[j] >= dataAll.rows()) continue; double* pFriend = dataAll.row(neighbors[j]); if(neighbors[j] >= pDataUnlabeled->rows()) { if((int)pFriend[featureDims] >= 0 && (int)pFriend[featureDims] < labelValues) tallys[(int)pFriend[featureDims]] += 1.0; } else if(labeled.bit(neighbors[j])) { if((int)pFriend[featureDims] >= 0 && (int)pFriend[featureDims] < labelValues) tallys[(int)pFriend[featureDims]] += 0.6; } } int label = GVec::indexOfMax(tallys, labelValues, m_pRand); double conf = tallys[label]; // Penalize for dissenting votes for(int j = 0; j < m_friendCount; j++) { if(neighbors[j] >= dataAll.rows()) continue; double* pFriend = dataAll.row(neighbors[j]); if(neighbors[j] >= pDataUnlabeled->rows()) { if((int)pFriend[featureDims] != label) conf *= 0.5; } else if(labeled.bit(neighbors[j])) { if((int)pFriend[featureDims] != label) conf *= 0.8; } } pRow[1] = label; pRow[2] = conf; } labelList.sort(2); // Assign the labels to the patterns we are most confident about size_t maxCount = MAX((size_t)5, pDataLabeled->rows() / 5); size_t count = 0; for(size_t i = labelList.rows() - 1; i < labelList.rows(); i--) { double* pRow = labelList.row(i); size_t index = (size_t)pRow[0]; int label = (int)pRow[1]; pDataUnlabeled->row(index)[featureDims] = label; labeled.set(index); labelList.deleteRow(i); if(count >= maxCount) break; count++; } } }