/*
 * Find the closest match in db, return as element of matches
 */
void SemanticDescriptor::findClosestMatch(Database & db, Matches & matches)
{
    SemanticDescriptor bestMatch = db.dDB.begin()->first;
    SemanticDescriptor compareID;
    int matchStrength;
    map<SemanticDescriptor, string>::iterator iter;

    for (iter = db.dDB.begin(); iter != db.dDB.end(); ++iter) {
        compareID = iter->first;
        matchStrength = closerMatch(compareID, bestMatch, db);

        /* If compareID is a better match, reset matches */
        if (matchStrength > 0 && compareID.coverage(*this, db) >= 0.5) {
            matches.clear();
            matches.insert(iter->second);
            bestMatch = compareID;
        }

        /* If compareID is an equal match, add it to the set of matches */
        if (matchStrength == 0 && compareID.coverage(*this, db) >= 0.5)
            matches.insert(iter->second);
    }
}
bool ImageTransformation::findHomography(const Keypoints& source, const Keypoints& result, const Matches& input, Matches& inliers, cv::Mat& homography)
{
    if (input.size() < 8)
        return false;

    std::vector<cv::Point2f> srcPoints, dstPoints;
    const int pointsCount = input.size();

    for (int i = 0; i < pointsCount; i++)
    {
        srcPoints.push_back(source[input[i].trainIdx].pt);
        dstPoints.push_back(result[input[i].queryIdx].pt);
    }

    // Estimate the homography with RANSAC and keep the inlier mask.
    // (cv::RANSAC is the correct flag here; CV_FM_RANSAC belongs to
    // cv::findFundamentalMat.)
    std::vector<unsigned char> status;
    homography = cv::findHomography(srcPoints, dstPoints, cv::RANSAC, 3, status);
    if (homography.empty())
        return false;

    inliers.clear();
    for (int i = 0; i < pointsCount; i++)
    {
        if (status[i])
        {
            inliers.push_back(input[i]);
        }
    }

    return true;
}
void TestRunner::getHash(Testable* testable, unsigned char result[])
{
    Matches matches = testable->collect();
    std::sort(matches.begin(), matches.end(), Match::sortFun);

    SHA1_Init(&shaCtx);
    for (Matches::iterator it = matches.begin(); it != matches.end(); ++it) {
        std::string matchStr = it->toString();
        SHA1_Update(&shaCtx, matchStr.c_str(), matchStr.size());
    }
    SHA1_Final(result, &shaCtx);
}
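// The digest above can be reproduced outside TestRunner with OpenSSL's
// incremental SHA1 API directly. A minimal standalone sketch, assuming the
// same concatenation order of the sorted match strings (the input strings
// here are made up; SHA1_* is deprecated in OpenSSL 3.x but still available):
#include <openssl/sha.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> parts = {"match1", "match2"}; // hypothetical data
    SHA_CTX ctx;
    SHA1_Init(&ctx);
    for (const std::string& s : parts)
        SHA1_Update(&ctx, s.c_str(), s.size());
    unsigned char digest[SHA_DIGEST_LENGTH];
    SHA1_Final(digest, &ctx);
    for (int i = 0; i < SHA_DIGEST_LENGTH; ++i)
        std::printf("%02x", digest[i]); // print digest as lowercase hex
    std::printf("\n");
    return 0;
}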
/*
 * Find exact matches for the semantic descriptor in the database
 */
bool SemanticDescriptor::findExactMatches(Database & db, Matches & matches)
{
    SemanticDescriptor compareID;
    map<SemanticDescriptor, string>::iterator iter;

    for (iter = db.dDB.begin(); iter != db.dDB.end(); ++iter) {
        compareID = iter->first;
        if (equals(compareID, db)) {
            matches.insert(iter->second);
        }
    }

    return !matches.empty();
}
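// A plausible call site combining the two lookups above, sketched under the
// assumption that Matches is a set-like container of strings (it is filled
// via insert()) and that callers prefer exact hits before falling back to the
// closest-match search. lookupDescriptor is a hypothetical helper:
Matches lookupDescriptor(SemanticDescriptor& desc, Database& db)
{
    Matches matches;
    if (!desc.findExactMatches(db, matches))  // fills and reports exact hits
        desc.findClosestMatch(db, matches);   // fallback: best approximate hits
    return matches;
}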
MatchPersistence MatchPersistenceFromJSON(const Value &value)
{
    auto executableName = value["executableName"].GetString();
    auto matcherName = value["matcherName"].GetString();
    auto executableArchitecture = value["executableArchitecture"].GetString();
    auto realTime = value["realTime"].GetDouble();
    auto cpuTime = value["cpuTime"].GetDouble();

    Matches matches;
    auto &matchesValue = value["matches"];
    for (rapidjson::SizeType i = 0; i < matchesValue.Size(); ++i) {
        auto match = MatchFromJSON(matchesValue[i]);
        if (match) {
            matches.push_back(match);
        }
    }

    return MatchPersistence(executableName, matcherName, executableArchitecture,
                            realTime, cpuTime, matches);
}
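// A minimal driver for the deserializer above, assuming rapidjson and a JSON
// document whose shape matches the fields read there (the field values are
// made up; MatchPersistenceFromJSON and its Match type come from the
// surrounding codebase):
#include "rapidjson/document.h"

void loadExample()
{
    const char* json =
        "{\"executableName\":\"demo\",\"matcherName\":\"bf\","
        "\"executableArchitecture\":\"x86_64\",\"realTime\":1.5,"
        "\"cpuTime\":1.2,\"matches\":[]}";

    rapidjson::Document doc;
    doc.Parse(json);
    if (!doc.HasParseError()) {
        MatchPersistence mp = MatchPersistenceFromJSON(doc);
        (void)mp;
    }
}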
bool computeMatchesDistanceStatistics(const Matches& matches, float& meanDistance, float& stdDev)
{
    if (matches.empty())
        return false;

    std::vector<float> distances(matches.size());
    for (size_t i = 0; i < matches.size(); i++)
        distances[i] = matches[i].distance;

    cv::Scalar mean, dev;
    cv::meanStdDev(distances, mean, dev);

    meanDistance = static_cast<float>(mean.val[0]);
    stdDev = static_cast<float>(dev.val[0]);

    // The statistics were computed; the original returned false here,
    // incorrectly signalling failure on the success path.
    return true;
}
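// Usage is straightforward; a small sketch, assuming Matches is a
// std::vector<cv::DMatch> as elsewhere in these snippets (the umbrella
// header keeps it portable across OpenCV 2.x/3.x layouts):
#include <opencv2/opencv.hpp>
#include <iostream>

int exampleStats()
{
    Matches matches;
    matches.push_back(cv::DMatch(0, 0, 10.f)); // queryIdx, trainIdx, distance
    matches.push_back(cv::DMatch(1, 1, 14.f));

    float mean = 0.f, stdDev = 0.f;
    if (computeMatchesDistanceStatistics(matches, mean, stdDev))
        std::cout << "mean=" << mean << " stddev=" << stdDev << "\n";
    return 0;
}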
void ratioTest(const std::vector<Matches>& knMatches, float maxRatio, Matches& goodMatches)
{
    goodMatches.clear();

    for (size_t i = 0; i < knMatches.size(); i++)
    {
        // Each entry must hold the two nearest neighbors for the ratio test;
        // skip degenerate entries rather than indexing out of bounds.
        if (knMatches[i].size() < 2)
            continue;

        const cv::DMatch& best = knMatches[i][0];
        const cv::DMatch& good = knMatches[i][1];

        assert(best.distance <= good.distance);

        // Lowe's ratio test: keep a match only when the best neighbor is
        // clearly better than the second-best one.
        float ratio = (best.distance / good.distance);
        if (ratio <= maxRatio)
        {
            goodMatches.push_back(best);
        }
    }
}
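// The vector-of-vectors input comes from a k-nearest-neighbor matcher. A
// minimal sketch using OpenCV's brute-force matcher (the NORM_HAMMING choice
// assumes binary descriptors; 0.75 is the ratio used in performEstimation
// later in this file):
#include <opencv2/opencv.hpp>

void matchWithRatioTest(const cv::Mat& queryDesc, const cv::Mat& trainDesc,
                        Matches& goodMatches)
{
    cv::BFMatcher matcher(cv::NORM_HAMMING);
    std::vector<Matches> knMatches;
    matcher.knnMatch(queryDesc, trainDesc, knMatches, 2); // two nearest neighbors
    ratioTest(knMatches, 0.75f, goodMatches);
}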
static void generateSummary(Summary &summary, char *htmlInput, const char *queryStr, const char *urlStr)
{
    Xml xml;
    ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));

    Words words;
    ASSERT_TRUE(words.set(&xml, true));

    Bits bits;
    ASSERT_TRUE(bits.set(&words));

    Url url;
    url.set(urlStr);

    Sections sections;
    ASSERT_TRUE(sections.set(&words, &bits, &url, "", CT_HTML));

    Query query;
    ASSERT_TRUE(query.set2(queryStr, langEnglish, true));

    LinkInfo linkInfo;
    memset(&linkInfo, 0, sizeof(LinkInfo));
    linkInfo.m_lisize = sizeof(LinkInfo);

    Title title;
    ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));

    Pos pos;
    ASSERT_TRUE(pos.set(&words));

    Bits bitsForSummary;
    ASSERT_TRUE(bitsForSummary.setForSummary(&words));

    Phrases phrases;
    ASSERT_TRUE(phrases.set(&words, &bits));

    Matches matches;
    matches.setQuery(&query);
    ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));

    summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
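// A hedged example of how such a helper is typically driven from a gtest
// case (the HTML, query, URL, and the checked accessors are illustrative
// assumptions, not taken from the original test file):
TEST(SummaryTest, GeneratesSummaryForSimplePage)
{
    char html[] = "<html><head><title>Example</title></head>"
                  "<body>Some example body text.</body></html>";
    Summary summary;
    generateSummary(summary, html, "example", "http://example.com/");
    // Assertions on summary.getSummary()/getSummaryLen() would go here.
}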
bool HybridTracker::Localize(const Marker& target, const Frame& scene, Matches& out)
{
    vector<cv::DMatch> matches;
    methods.Match(target.descriptor, scene.descriptor, matches);

    out.clear();
    for (auto& it : matches)
    {
        out._targetPts.push_back(target.keys[it.queryIdx]);
        out._scenePts.push_back(scene.keys[it.trainIdx]);
        out._error.push_back(it.distance);
    }

    return matches.size() > 20;
}
/*
 * Function: initRouteMatrix()
 * Comments: Initializes global route matrix for this Solution
 */
void Solution::initRouteMatrix()
{
    if (matrixInitType == FLUSH || myInternalMatrix.use_count() == 0) {
        Matches matches;
        Riders riders;

        for (MatchesMap::iterator it = myMatches.begin(); it != myMatches.end(); it++) {
            matches.push_back(it->second);
            for (Riders::iterator rider = it->second.confirmedRiders.begin();
                 rider != it->second.confirmedRiders.end(); rider++) {
                riders.push_back(*rider);
            }
        }

        for (RidersMap::iterator it = myRiders.begin(); it != myRiders.end(); it++) {
            riders.push_back(it->second);
        }

        Drivers emptyDrivers;
        if (useLocalMatrix) {
            myInternalMatrix.reset(new RouteMatrixLocal(emptyDrivers, riders, matches));
        } else {
            myInternalMatrix.reset(new RouteMatrix<>(emptyDrivers, riders, matches));
        }
    }

    routeMatrix = myInternalMatrix;
}
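// Note that use_count() == 0 above is just an empty-shared_ptr check. A small
// standalone sketch of the same rebuild-or-reuse caching pattern with
// std::shared_ptr (all names here are illustrative, not from the codebase):
#include <memory>

struct RouteMatrixBase { virtual ~RouteMatrixBase() = default; };

static std::shared_ptr<RouteMatrixBase> cachedMatrix;

std::shared_ptr<RouteMatrixBase> getRouteMatrix(bool flush)
{
    if (flush || !cachedMatrix)                      // rebuild on demand
        cachedMatrix = std::make_shared<RouteMatrixBase>();
    return cachedMatrix;                             // otherwise reuse
}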
bool performEstimation(const FeatureAlgorithm& alg,
                       const ImageTransformation& transformation,
                       const cv::Mat& sourceImage,
                       std::vector<FrameMatchingStatistics>& stat)
{
    Keypoints sourceKp;
    Descriptors sourceDesc;

    cv::Mat gray;
    if (sourceImage.channels() == 3)
        cv::cvtColor(sourceImage, gray, CV_BGR2GRAY);
    else if (sourceImage.channels() == 4)
        cv::cvtColor(sourceImage, gray, CV_BGRA2GRAY);
    else if (sourceImage.channels() == 1)
        gray = sourceImage;

    if (!alg.extractFeatures(gray, sourceKp, sourceDesc))
        return false;

    std::vector<float> x = transformation.getX();
    stat.resize(x.size());

    const int count = x.size();

    cv::Mat transformedImage;
    Keypoints resKpReal;
    Descriptors resDesc;
    Matches matches;

    // To convert ticks to milliseconds
    const double toMsMul = 1000. / cv::getTickFrequency();

    #pragma omp parallel for private(transformedImage, resKpReal, resDesc, matches)
    for (int i = 0; i < count; i++)
    {
        float arg = x[i];
        FrameMatchingStatistics& s = stat[i];

        transformation.transform(arg, gray, transformedImage);

        int64 start = cv::getTickCount();

        alg.extractFeatures(transformedImage, resKpReal, resDesc);

        // Initialize required fields
        s.isValid = resKpReal.size() > 0;
        s.argumentValue = arg;

        if (!s.isValid)
            continue;

        if (alg.knMatchSupported)
        {
            std::vector<Matches> knMatches;
            alg.matchFeatures(sourceDesc, resDesc, 2, knMatches);
            ratioTest(knMatches, 0.75, matches);

            // Percent of false matches that were rejected by the ratio test
            // (guarded: knMatches can be empty)
            s.ratioTestFalseLevel = knMatches.empty() ? 0.0f
                : (float)(knMatches.size() - matches.size()) / (float)knMatches.size();
        }
        else
        {
            alg.matchFeatures(sourceDesc, resDesc, matches);
        }

        int64 end = cv::getTickCount();

        Matches correctMatches;
        cv::Mat homography;
        bool homographyFound = ImageTransformation::findHomography(sourceKp, resKpReal, matches, correctMatches, homography);

        // Some simple stats:
        s.isValid = homographyFound;
        s.totalKeypoints = resKpReal.size();
        s.consumedTimeMs = (end - start) * toMsMul;

        // Overall percent of matched keypoints (guarded against an empty match set)
        s.percentOfMatches = (float)matches.size() / (float)(std::min(sourceKp.size(), resKpReal.size()));
        s.correctMatchesPercent = matches.empty() ? 0.0f
            : (float)correctMatches.size() / (float)matches.size();

        // Matching distance statistics
        computeMatchesDistanceStatistics(correctMatches, s.meanDistance, s.stdDevDistance);
    }

    return true;
}
void LookupWidget::setCompletionItems(const Matches& matches)
{
    ui_->word_input->completionObject()->setItems(matches.words());
}
typename PointMatcher<T>::ErrorMinimizer::ErrorElements& PointMatcher<T>::ErrorMinimizer::getMatchedPoints(
    const DataPoints& requestedPts,
    const DataPoints& sourcePts,
    const Matches& matches,
    const OutlierWeights& outlierWeights)
{
    typedef typename Matches::Ids Ids;
    typedef typename Matches::Dists Dists;

    assert(matches.ids.rows() > 0);
    assert(matches.ids.cols() > 0);
    assert(matches.ids.cols() == requestedPts.features.cols()); // nbpts
    assert(outlierWeights.rows() == matches.ids.rows()); // knn

    const int knn = outlierWeights.rows();
    const int dimFeat = requestedPts.features.rows();
    const int dimReqDesc = requestedPts.descriptors.rows();

    // Count points with non-zero weights
    const int pointsCount = (outlierWeights.array() != 0.0).count();
    if (pointsCount == 0)
        throw ConvergenceError("ErrorMinimizer: no point to minimize");

    Matrix keptFeat(dimFeat, pointsCount);
    Matrix keptDesc;
    if (dimReqDesc > 0)
        keptDesc = Matrix(dimReqDesc, pointsCount);

    Matches keptMatches(Dists(1, pointsCount), Ids(1, pointsCount));
    OutlierWeights keptWeights(1, pointsCount);

    int j = 0;
    int rejectedMatchCount = 0;
    int rejectedPointCount = 0;
    bool matchExist = false;
    this->weightedPointUsedRatio = 0;

    for (int i = 0; i < requestedPts.features.cols(); ++i) // nb pts
    {
        matchExist = false;
        for (int k = 0; k < knn; k++) // knn
        {
            if (outlierWeights(k, i) != 0.0)
            {
                if (dimReqDesc > 0)
                    keptDesc.col(j) = requestedPts.descriptors.col(i);

                keptFeat.col(j) = requestedPts.features.col(i);
                keptMatches.ids(0, j) = matches.ids(k, i);
                keptMatches.dists(0, j) = matches.dists(k, i);
                keptWeights(0, j) = outlierWeights(k, i);
                ++j;
                this->weightedPointUsedRatio += outlierWeights(k, i);
                matchExist = true;
            }
            else
            {
                rejectedMatchCount++;
            }
        }

        if (matchExist == false)
        {
            rejectedPointCount++;
        }
    }

    assert(j == pointsCount);

    this->pointUsedRatio = double(j) / double(knn * requestedPts.features.cols());
    this->weightedPointUsedRatio /= double(knn * requestedPts.features.cols());

    assert(dimFeat == sourcePts.features.rows());
    const int dimSourDesc = sourcePts.descriptors.rows();

    Matrix associatedFeat(dimFeat, pointsCount);
    Matrix associatedDesc;
    if (dimSourDesc > 0)
        associatedDesc = Matrix(dimSourDesc, pointsCount);

    // Fetch matched points
    for (int i = 0; i < pointsCount; ++i)
    {
        const int refIndex(keptMatches.ids(i));
        associatedFeat.col(i) = sourcePts.features.block(0, refIndex, dimFeat, 1);
        if (dimSourDesc > 0)
            associatedDesc.col(i) = sourcePts.descriptors.block(0, refIndex, dimSourDesc, 1);
    }

    this->lastErrorElements.reading = DataPoints(
        keptFeat,
        requestedPts.featureLabels,
        keptDesc,
        requestedPts.descriptorLabels
    );
    this->lastErrorElements.reference = DataPoints(
        associatedFeat,
        sourcePts.featureLabels,
        associatedDesc,
        sourcePts.descriptorLabels
    );
    this->lastErrorElements.weights = keptWeights;
    this->lastErrorElements.matches = keptMatches;
    this->lastErrorElements.nbRejectedMatches = rejectedMatchCount;
    this->lastErrorElements.nbRejectedPoints = rejectedPointCount;

    return this->lastErrorElements;
}
void InspectorsImpl<T>::AbstractVTKInspector::dumpDataLinks(
    const DataPoints& ref,
    const DataPoints& reading,
    const Matches& matches,
    const OutlierWeights& featureOutlierWeights,
    std::ostream& stream)
{
    const Matrix& refFeatures(ref.features);
    const int refPtCount(refFeatures.cols());
    const Matrix& readingFeatures(reading.features);
    const int readingPtCount(readingFeatures.cols());
    const int totalPtCount(refPtCount + readingPtCount);

    stream << "# vtk DataFile Version 3.0\n";
    stream << "comment\n";
    stream << "ASCII\n";
    stream << "DATASET POLYDATA\n";

    stream << "POINTS " << totalPtCount << " float\n";

    if (refFeatures.rows() == 4)
    {
        // reference pt
        stream << refFeatures.topLeftCorner(3, refFeatures.cols()).transpose() << "\n";
        // reading pt
        stream << readingFeatures.topLeftCorner(3, readingFeatures.cols()).transpose() << "\n";
    }
    else
    {
        // reference pt
        stream << refFeatures.transpose() << "\n";
        // reading pt
        stream << readingFeatures.transpose() << "\n";
    }

    const int knn = matches.ids.rows();

    stream << "LINES " << readingPtCount * knn << " " << readingPtCount * knn * 3 << "\n";
    for (int k = 0; k < knn; k++) // knn
    {
        for (int i = 0; i < readingPtCount; ++i)
        {
            stream << "2 " << refPtCount + i << " " << matches.ids(k, i) << "\n";
        }
    }

    stream << "CELL_DATA " << readingPtCount * knn << "\n";
    stream << "SCALARS outlier float 1\n";
    stream << "LOOKUP_TABLE default\n";
    for (int k = 0; k < knn; k++) // knn
    {
        for (int i = 0; i < readingPtCount; ++i) // nb pts
        {
            stream << featureOutlierWeights(k, i) << "\n";
        }
    }
}
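// For reference, the legacy-VTK polydata skeleton written above looks like
// this for a hypothetical dump with one reference point, one reading point
// and knn = 1 (coordinates and the outlier weight are made up):
//
//   # vtk DataFile Version 3.0
//   comment
//   ASCII
//   DATASET POLYDATA
//   POINTS 2 float
//   0 0 0
//   1 0 0
//   LINES 1 3
//   2 1 0
//   CELL_DATA 1
//   SCALARS outlier float 1
//   LOOKUP_TABLE default
//   0.75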
int main()
{
    std::string port = ":9000";
    int listenQueueBacklog = 400;

    if (FCGX_Init()) {
        exit(1);
    }

    int listen_socket = FCGX_OpenSocket(port.c_str(), listenQueueBacklog);
    if (listen_socket < 0) {
        exit(1);
    }

    FCGX_Request request;
    if (FCGX_InitRequest(&request, listen_socket, 0)) {
        exit(1);
    }

    std::string header = "Content-type: text/html\r\n\r\n";

    NumTable db;
    std::map<Number, Number> posts;

    try {
        sql::Driver *driver = get_driver_instance();
        sql::Connection *con = driver->connect("tcp://127.0.0.1:3306", "wiki_bot", "31415");
        con->setSchema("orpheus");

        sql::Statement *stmt = con->createStatement();
        sql::ResultSet *res = stmt->executeQuery("SELECT hash, track_id FROM hashes_all");

        while (res->next()) {
            Number h = res->getUInt64("hash");
            db.push_back(h);
            posts[h] = res->getUInt64("track_id");
        }

        delete res;
        delete stmt;
        delete con;
    } catch (sql::SQLException &e) {
        std::cout << "# ERR: SQLException in " << __FILE__;
        std::cout << "(" << __FUNCTION__ << ") on line " << __LINE__ << std::endl;
        std::cout << "# ERR: " << e.what();
        std::cout << " (MySQL error code: " << e.getErrorCode();
        std::cout << ", SQLState: " << e.getSQLState() << " )" << std::endl;
    }

    std::cout << "Start building." << std::endl;
    HEngine_sn e(7);
    e.build(db);
    std::cout << "Building done." << std::endl;

    while (FCGX_Accept_r(&request) == 0) {
        std::cout << "Have request. " << std::endl;

        NumTable q;
        Number hash;

        // FCGX_GetParam can return NULL; guard before constructing the string.
        const char *qs = FCGX_GetParam("QUERY_STRING", request.envp);
        std::string query = qs ? qs : "";
        std::stringstream ss(query);

        // The query string is a comma-separated list of hashes.
        while (ss >> hash) {
            q.push_back(hash);
            if (ss.peek() == ',') {
                ss.ignore();
            }
        }

        std::string body = header;
        int c = 0;
        for (auto &h : q) {
            Matches res = e.query(h);
            c += res.size();
            for (auto &r : res) {
                body += std::to_string(posts[r.first]) + ":"
                      + std::to_string(r.first) + ":"
                      + std::to_string(r.second) + "<br/>";
            }
        }

        FCGX_PutS(body.c_str(), request.out);
        FCGX_Finish_r(&request);
    }

    return 0;
}
bool ImageTransformation::findHomography(const Keypoints& source, const Keypoints& result, const Matches& input, Matches& inliers, cv::Mat& homography)
{
    if (input.size() < 4)
        return false;

    const int pointsCount = input.size();
    const float reprojectionThreshold = 2;

    // Prepare src and dst points
    std::vector<cv::Point2f> srcPoints, dstPoints;
    for (int i = 0; i < pointsCount; i++)
    {
        srcPoints.push_back(source[input[i].trainIdx].pt);
        dstPoints.push_back(result[input[i].queryIdx].pt);
    }

    // Find homography using RANSAC algorithm
    std::vector<unsigned char> status;
    homography = cv::findHomography(srcPoints, dstPoints, cv::RANSAC, reprojectionThreshold, status);

    // Warp dstPoints to srcPoints domain using inverted homography transformation
    std::vector<cv::Point2f> srcReprojected;
    cv::perspectiveTransform(dstPoints, srcReprojected, homography.inv());

    // Pass only matches with low reprojection error (less than reprojectionThreshold value in pixels)
    inliers.clear();
    for (int i = 0; i < pointsCount; i++)
    {
        cv::Point2f actual = srcPoints[i];
        cv::Point2f expect = srcReprojected[i];
        cv::Point2f v = actual - expect;
        float distanceSquared = v.dot(v);

        if (/*status[i] && */ distanceSquared <= reprojectionThreshold * reprojectionThreshold)
        {
            inliers.push_back(input[i]);
        }
    }

    // Test for bad case
    if (inliers.size() < 4)
        return false;

    // Now use only good points to find refined homography:
    std::vector<cv::Point2f> refinedSrc, refinedDst;
    for (size_t i = 0; i < inliers.size(); i++)
    {
        refinedSrc.push_back(source[inliers[i].trainIdx].pt);
        refinedDst.push_back(result[inliers[i].queryIdx].pt);
    }

    // Use least squares method to find precise homography
    cv::Mat homography2 = cv::findHomography(refinedSrc, refinedDst, 0, reprojectionThreshold);

    // Reproject again:
    cv::perspectiveTransform(dstPoints, srcReprojected, homography2.inv());
    inliers.clear();
    for (int i = 0; i < pointsCount; i++)
    {
        cv::Point2f actual = srcPoints[i];
        cv::Point2f expect = srcReprojected[i];
        cv::Point2f v = actual - expect;
        float distanceSquared = v.dot(v);

        if (distanceSquared <= reprojectionThreshold * reprojectionThreshold)
        {
            inliers.push_back(input[i]);
        }
    }

    homography = homography2;
    return inliers.size() >= 4;
}
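// This two-stage estimate (RANSAC for outlier rejection, then an unweighted
// least-squares refit on the surviving inliers) trades a little code for
// noticeably more stable homographies. A quick way to sanity-check the
// surviving inliers visually, assuming Matches is std::vector<cv::DMatch>
// and that queryImage/trainImage are the transformed and reference frames
// (names here are illustrative):
#include <opencv2/opencv.hpp>

void showInliers(const cv::Mat& trainImage, const Keypoints& source,
                 const cv::Mat& queryImage, const Keypoints& result,
                 const Matches& inliers)
{
    cv::Mat canvas;
    // Note the index orientation used above: trainIdx indexes the reference
    // keypoints ("source"), queryIdx the transformed ones ("result").
    cv::drawMatches(queryImage, result, trainImage, source, inliers, canvas);
    cv::imshow("inliers", canvas);
    cv::waitKey(0);
}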
int main(int argc, char **argv)
{
    float userTime;
    struct rusage startTime, stopTime;

    getrusage(RUSAGE_SELF, &startTime);

    if (argc != 4) {
        std::cout << "Usage: " << argv[0] << " <k> <data file> <query file>"
                  << std::endl << std::endl;
        return 1;
    }

    unsigned k = atoi(argv[1]);
    std::string line;

    std::cout << "Reading the dataset ........ ";
    fflush(stdout);

    NumTable db;
    std::ifstream dict;
    dict.open(argv[2], std::ifstream::in);
    while (getline(dict, line)) {
        Number h;
        std::istringstream reader(line);
        reader >> h;
        db.push_back(h);
    }

    NumTable q;
    std::ifstream qdict;
    qdict.open(argv[3], std::ifstream::in);
    while (getline(qdict, line)) {
        Number h;
        std::istringstream reader(line);
        reader >> h;
        q.push_back(h);
    }

    std::cout << "done. " << db.size() << " db hashes and "
              << q.size() << " query hashes." << std::endl;

    std::cout << "Building with " << k << " hamming distance bound ....... ";
    fflush(stdout);

    HEngine_sn e(k);
    e.build(db);
    std::cout << "done." << std::endl;

    getrusage(RUSAGE_SELF, &stopTime);
    userTime = ((float)(stopTime.ru_utime.tv_sec - startTime.ru_utime.tv_sec))
             + ((float)(stopTime.ru_utime.tv_usec - startTime.ru_utime.tv_usec)) * 1e-6;

    std::cout << std::endl;
    std::cout << "Building time: " << userTime << " seconds" << std::endl;
    std::cout << std::endl;

    std::cout << "Searching HEngine matches ......." << std::endl;
    getrusage(RUSAGE_SELF, &startTime);

    int c = 0;
    for (auto &h : q) {
        Matches res = e.query(h);
        c += res.size();
    }

    getrusage(RUSAGE_SELF, &stopTime);
    userTime = ((float)(stopTime.ru_utime.tv_sec - startTime.ru_utime.tv_sec))
             + ((float)(stopTime.ru_utime.tv_usec - startTime.ru_utime.tv_usec)) * 1e-6;

    std::cout << "found " << c << " total matches. HEngine query time: "
              << userTime << " seconds" << std::endl << std::endl;

    std::cout << "Searching linear matches ......." << std::endl;
    getrusage(RUSAGE_SELF, &startTime);

    c = 0;
    for (auto &item : db) {
        for (auto &h : q) {
            unsigned d = HEngine::getHammingDistance(h, item);
            if (d <= k) {
                c++;
            }
        }
    }

    getrusage(RUSAGE_SELF, &stopTime);
    userTime = ((float)(stopTime.ru_utime.tv_sec - startTime.ru_utime.tv_sec))
             + ((float)(stopTime.ru_utime.tv_usec - startTime.ru_utime.tv_usec)) * 1e-6;

    std::cout << "found " << c << " total matches. Linear query time: "
              << userTime << " seconds" << std::endl << std::endl;

    std::cout << std::endl;
    return 0;
}
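// The linear baseline above relies on HEngine::getHammingDistance. For 64-bit
// hashes that is just a popcount of the XOR; a sketch independent of the
// HEngine implementation (__builtin_popcountll assumes GCC/Clang):
#include <cstdint>

inline unsigned hammingDistance64(uint64_t a, uint64_t b)
{
    // Differing bit positions survive the XOR; count them.
    return static_cast<unsigned>(__builtin_popcountll(a ^ b));
}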
typename ErrorMinimizersImpl<T>::Matrix
ErrorMinimizersImpl<T>::PointToPlaneWithCovErrorMinimizer::estimateCovariance(
    const DataPoints& reading,
    const DataPoints& reference,
    const Matches& matches,
    const OutlierWeights& outlierWeights,
    const TransformationParameters& transformation)
{
    int max_nbr_point = outlierWeights.cols();

    Matrix covariance(Matrix::Zero(6,6));
    Matrix J_hessian(Matrix::Zero(6,6));
    Matrix d2J_dReadingdX(Matrix::Zero(6, max_nbr_point));
    Matrix d2J_dReferencedX(Matrix::Zero(6, max_nbr_point));

    Vector reading_point(Vector::Zero(3));
    Vector reference_point(Vector::Zero(3));
    Vector normal(3);
    Vector reading_direction(Vector::Zero(3));
    Vector reference_direction(Vector::Zero(3));

    Matrix normals = reference.getDescriptorViewByName("normals");

    if (normals.rows() < 3) // Make sure there are normals in DataPoints
        return std::numeric_limits<T>::max() * Matrix::Identity(6,6);

    T beta = -asin(transformation(2,0));
    T alpha = atan2(transformation(2,1), transformation(2,2));
    T gamma = atan2(transformation(1,0)/cos(beta), transformation(0,0)/cos(beta));
    T t_x = transformation(0,3);
    T t_y = transformation(1,3);
    T t_z = transformation(2,3);

    Vector tmp_vector_6(Vector::Zero(6));

    int valid_points_count = 0;

    for (int i = 0; i < max_nbr_point; ++i)
    {
        if (outlierWeights(0,i) > 0.0)
        {
            reading_point = reading.features.block(0,i,3,1);
            int reference_idx = matches.ids(0,i);
            reference_point = reference.features.block(0,reference_idx,3,1);

            normal = normals.block(0,reference_idx,3,1);

            T reading_range = reading_point.norm();
            reading_direction = reading_point / reading_range;
            T reference_range = reference_point.norm();
            reference_direction = reference_point / reference_range;

            T n_alpha = normal(2)*reading_direction(1) - normal(1)*reading_direction(2);
            T n_beta  = normal(0)*reading_direction(2) - normal(2)*reading_direction(0);
            T n_gamma = normal(1)*reading_direction(0) - normal(0)*reading_direction(1);

            T E = normal(0)*(reading_point(0) - gamma*reading_point(1) + beta*reading_point(2) + t_x - reference_point(0));
            E += normal(1)*(gamma*reading_point(0) + reading_point(1) - alpha*reading_point(2) + t_y - reference_point(1));
            E += normal(2)*(-beta*reading_point(0) + alpha*reading_point(1) + reading_point(2) + t_z - reference_point(2));

            T N_reading = normal(0)*(reading_direction(0) - gamma*reading_direction(1) + beta*reading_direction(2));
            N_reading += normal(1)*(gamma*reading_direction(0) + reading_direction(1) - alpha*reading_direction(2));
            N_reading += normal(2)*(-beta*reading_direction(0) + alpha*reading_direction(1) + reading_direction(2));

            T N_reference = -(normal(0)*reference_direction(0) + normal(1)*reference_direction(1) + normal(2)*reference_direction(2));

            // update the hessian and d2J/dzdx
            tmp_vector_6 << normal(0), normal(1), normal(2),
                            reading_range * n_alpha, reading_range * n_beta, reading_range * n_gamma;

            J_hessian += tmp_vector_6 * tmp_vector_6.transpose();

            tmp_vector_6 << normal(0) * N_reading, normal(1) * N_reading, normal(2) * N_reading,
                            n_alpha * (E + reading_range * N_reading),
                            n_beta  * (E + reading_range * N_reading),
                            n_gamma * (E + reading_range * N_reading);

            d2J_dReadingdX.block(0,valid_points_count,6,1) = tmp_vector_6;

            tmp_vector_6 << normal(0) * N_reference, normal(1) * N_reference, normal(2) * N_reference,
                            reference_range * n_alpha * N_reference,
                            reference_range * n_beta  * N_reference,
                            reference_range * n_gamma * N_reference;

            d2J_dReferencedX.block(0,valid_points_count,6,1) = tmp_vector_6;

            valid_points_count++;
        } // if (outlierWeights(0,i) > 0.0)
    }

    Matrix d2J_dZdX(Matrix::Zero(6, 2 * valid_points_count));
    d2J_dZdX.block(0,0,6,valid_points_count) = d2J_dReadingdX.block(0,0,6,valid_points_count);
    d2J_dZdX.block(0,valid_points_count,6,valid_points_count) = d2J_dReferencedX.block(0,0,6,valid_points_count);

    Matrix inv_J_hessian = J_hessian.inverse();

    covariance = d2J_dZdX * d2J_dZdX.transpose();
    covariance = inv_J_hessian * covariance * inv_J_hessian;

    return (sensorStdDev * sensorStdDev) * covariance;
}
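// The closing lines assemble a closed-form covariance estimate for ICP in
// the style of Censi's method. In LaTeX notation, with \sigma the sensor
// noise standard deviation, J the point-to-plane cost, x the 6-DOF pose and
// z the measurements, the quantity returned above is:
//
//   \Sigma_x \approx \sigma^2
//       \left(\frac{\partial^2 J}{\partial x^2}\right)^{-1}
//       \frac{\partial^2 J}{\partial z\,\partial x}
//       \left(\frac{\partial^2 J}{\partial z\,\partial x}\right)^{\top}
//       \left(\frac{\partial^2 J}{\partial x^2}\right)^{-1}
//
// where J_hessian is \partial^2 J / \partial x^2 and d2J_dZdX stacks the
// reading and reference blocks of \partial^2 J / \partial z \partial x.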
void Solution::initMyMatches(const Matches& matches)
{
    for (size_t ii = 0; ii < matches.size(); ii++) {
        myMatches[matches[ii].myDriver.routeId] = matches[ii];
    }
}
// returns false and sets g_errno on error bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query, LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf, int32_t filteredRootTitleBufSize, uint8_t contentType, uint8_t langId, int32_t niceness ) { // make Msg20.cpp faster if it is just has // Msg20Request::m_setForLinkInfo set to true, no need to extricate a title. if ( maxTitleLen <= 0 ) { return true; } m_niceness = niceness; m_maxTitleLen = maxTitleLen; // if this is too big the "first line" algo can be huge!!! // and really slow everything way down with a huge title candidate int32_t maxTitleWords = 128; // assume no title reset(); int32_t NW = words->getNumWords(); // // now get all the candidates // // . allow up to 100 title CANDIDATES // . "as" is the word # of the first word in the candidate // . "bs" is the word # of the last word IN the candidate PLUS ONE int32_t n = 0; int32_t as[MAX_TIT_CANDIDATES]; int32_t bs[MAX_TIT_CANDIDATES]; float scores[MAX_TIT_CANDIDATES]; Words *cptrs[MAX_TIT_CANDIDATES]; int32_t types[MAX_TIT_CANDIDATES]; int32_t parent[MAX_TIT_CANDIDATES]; // record the scoring algos effects float baseScore [MAX_TIT_CANDIDATES]; float noCapsBoost [MAX_TIT_CANDIDATES]; float qtermsBoost [MAX_TIT_CANDIDATES]; float inCommonCandBoost[MAX_TIT_CANDIDATES]; // reset these for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) { // assume no parent parent[i] = -1; } // xml and words class for each link info, rss item Xml tx[MAX_TIT_CANDIDATES]; Words tw[MAX_TIT_CANDIDATES]; int32_t ti = 0; // restrict how many link texts and rss blobs we check for titles // because title recs like www.google.com have hundreds and can // really slow things down to like 50ms for title generation int32_t kcount = 0; int32_t rcount = 0; //int64_t x = gettimeofdayInMilliseconds(); // . get every link text // . TODO: repeat for linkInfo2, the imported link text for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) { // breathe QUICKPOLL(m_niceness); // fast skip check for link text if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue; // fast skip check for rss item if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue; // set Url Url u; u.set( k->getUrl(), k->size_urlBuf ); // is it the same host as us? bool sh = true; // skip if not from same host and should be if ( firstUrl->getHostLen() != u.getHostLen() ) { sh = false; } // skip if not from same host and should be if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) { sh = false; } // get the link text if ( k->size_linkText >= 3 ) { char *p = k->getLinkText(); int32_t plen = k->size_linkText - 1; if ( ! verifyUtf8 ( p , plen ) ) { log("title: set4 bad link text from url=%s", k->getUrl()); continue; } // now the words. if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) { return false; } // set the bookends, it is the whole thing cptrs [n] = &tw[ti]; as [n] = 0; bs [n] = tw[ti].getNumWords(); // score higher if same host if ( sh ) scores[n] = 1.05; // do not count so high if remote! else scores[n] = 0.80; // set the type if ( sh ) types [n] = TT_LINKTEXTLOCAL; else types [n] = TT_LINKTEXTREMOTE; // another candidate n++; // use xml and words ti++; // break out if too many already. save some for below. if ( n + 30 >= MAX_TIT_CANDIDATES ) break; } // get the rss item if ( k->size_rssItem <= 10 ) continue; // . returns false and sets g_errno on error // . use a 0 for niceness if ( ! 
k->setXmlFromRSS ( &tx[ti] , 0 ) ) return false; // get the word range int32_t tslen; bool isHtmlEnc; char *ts = tx[ti].getRSSTitle ( &tslen , &isHtmlEnc ); // skip if not in the rss if ( ! ts ) continue; // skip if empty if ( tslen <= 0 ) continue; // now set words to that if ( !tw[ti].set( ts, tslen, true, 0 ) ) { return false; } // point to that cptrs [n] = &tw[ti]; as [n] = 0; bs [n] = tw[ti].getNumWords(); // increment since we are using it ti++; // base score for rss title if ( sh ) scores[n] = 5.0; // if not same host, treat like link text else scores[n] = 2.0; // set the type if ( sh ) types [n] = TT_RSSITEMLOCAL; else types [n] = TT_RSSITEMREMOTE; // advance n++; // break out if too many already. save some for below. if ( n + 30 >= MAX_TIT_CANDIDATES ) break; } //logf(LOG_DEBUG,"title: took1=%" PRId64,gettimeofdayInMilliseconds()-x); //x = gettimeofdayInMilliseconds(); // . set the flags array // . indicates what words are in title candidates already, but // that is set below // . up here we set words that are not allowed to be in candidates, // like words that are in a link that is not a self link // . alloc for it char *flags = NULL; char localBuf[10000]; int32_t need = words->getNumWords(); if ( need <= 10000 ) { flags = (char *)localBuf; } else { flags = (char *)mmalloc(need,"TITLEflags"); } if ( ! flags ) { return false; } // clear it memset ( flags , 0 , need ); // check tags in body nodeid_t *tids = words->getTagIds(); // scan to set link text flags // loop over all "words" in the html body char inLink = false; char selfLink = false; for ( int32_t i = 0 ; i < NW ; i++ ) { // breathe QUICKPOLL(m_niceness); // if in a link that is not self link, cannot be in a candidate if ( inLink && ! selfLink ) { flags[i] |= 0x02; } // out of a link if ( tids[i] == (TAG_A | BACKBIT) ) { inLink = false; } // if not start of <a> tag, skip it if ( tids[i] != TAG_A ) { continue; } // flag it inLink = true; // get the node in the xml int32_t xn = words->getNodes()[i]; // is it a self link? int32_t len; char *link = xml->getString(xn,"href",&len); // . set the url class to this // . TODO: use the base url in the doc Url u; u.set( link, len, true, false ); // compare selfLink = u.equals ( firstUrl ); // skip if not selfLink if ( ! selfLink ) { continue; } // if it is a selflink , check for an "onClick" tag in the // anchor tag to fix that Mixx issue for: // http://www.npr.org/templates/story/story.php?storyId=5417137 int32_t oclen; char *oc = xml->getString(xn,"onclick",&oclen); if ( ! oc ) { oc = xml->getString(xn,"onClick",&oclen); } // assume not a self link if we see that... if ( oc ) { selfLink = false; } // if this <a href> link has a "title" attribute, use that // instead! that thing is solid gold. int32_t atlen; char *atitle = xml->getString(xn,"title",&atlen); // stop and use that, this thing is gold! if ( ! atitle || atlen <= 0 ) { continue; } // craziness? ignore it... if ( atlen > 400 ) { continue; } // if it contains permanent, permalink or share, ignore it! if ( strncasestr ( atitle, "permalink", atlen ) || strncasestr ( atitle,"permanent", atlen) || strncasestr ( atitle,"share", atlen) ) { continue; } // do not count the link text as viable selfLink = false; // aw, dammit if ( ti >= MAX_TIT_CANDIDATES ) { continue; } // other dammit if ( n >= MAX_TIT_CANDIDATES ) { break; } // ok, process it if ( ! 
tw[ti].set ( atitle, atlen, true, 0 )) { return false; } // set the bookends, it is the whole thing cptrs [n] = &tw[ti]; as [n] = 0; bs [n] = tw[ti].getNumWords(); scores [n] = 3.0; // not ALWAYS solid gold! types [n] = TT_TITLEATT; // we are using the words class ti++; // advance n++; // break out if too many already. save some for below. if ( n + 20 >= MAX_TIT_CANDIDATES ) { break; } } //logf(LOG_DEBUG,"title: took2=%" PRId64,gettimeofdayInMilliseconds()-x); //x = gettimeofdayInMilliseconds(); //int64_t *wids = WW->getWordIds(); // . find the last positive scoring guy // . do not consider title candidates after "r" if "r" is non-zero // . FIXES http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/ // the candidate # of the title tag int32_t tti = -1; // allow up to 4 tags from each type char table[512]; // sanity check if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; } // clear table counts memset ( table , 0 , 512 ); // the first word char *wstart = NULL; if ( NW > 0 ) { wstart = words->getWord(0); } // loop over all "words" in the html body for ( int32_t i = 0 ; i < NW ; i++ ) { // come back up here if we encounter another "title-ish" tag // within our first alleged "title-ish" tag subloop: // stop after 30k of text if ( words->getWord(i) - wstart > 200000 ) { break; // 1106 } // get the tag id minus the back tag bit nodeid_t tid = tids[i] & BACKBITCOMP; // pen up and pen down for these comment like tags if ( tid == TAG_SCRIPT || tid == TAG_STYLE ) { // ignore "titles" in script or style tags if ( ! (tids[i] & BACKBIT) ) { continue; } } /// @todo ALC we should allow more tags than just title/link // skip if not a good tag. if (tid != TAG_TITLE && tid != TAG_A) { continue; } // must NOT be a back tag if ( tids[i] & BACKBIT ) { continue; } // skip if we hit our limit if ( table[tid] >= 4 ) { continue; } // skip over tag/word #i i++; // no words in links, unless it is a self link if ( i < NW && (flags[i] & 0x02) ) { continue; } // the start should be here int32_t start = -1; // do not go too far int32_t max = i + 200; // find the corresponding back tag for it for ( ; i < NW && i < max ; i++ ) { // hey we got it, BUT we got no alnum word first // so the thing was empty, so loop back to subloop if ( (tids[i] & BACKBITCOMP) == tid && (tids[i] & BACKBIT ) && start == -1 ) { goto subloop; } // if we hit another title-ish tag, loop back up if ( (tids[i] & BACKBITCOMP) == TAG_TITLE || (tids[i] & BACKBITCOMP) == TAG_A ) { // if no alnum text, restart at the top if ( start == -1 ) { goto subloop; } // otherwise, break out and see if title works break; } // if we hit a breaking tag... if ( isBreakingTagId ( tids[i] & BACKBITCOMP ) && // do not consider <span> tags breaking for // our purposes. i saw a <h1><span> setup before. tids[i] != TAG_SPAN ) { break; } // skip if not alnum word if ( ! words->isAlnum(i) ) { continue; } // if we hit an alnum word, break out if ( start == -1 ) { start = i; } } // if no start was found, must have had a 0 score in there if ( start == -1 ) { continue; } // if we exhausted the doc, we are done if ( i >= NW ) { break; } // skip if way too big! if ( i >= max ) { continue; } // if was too long do not consider a title if ( i - start > 300 ) { continue; } // . skip if too many bytes // . 
this does not include the length of word #i, but #(i-1) if ( words->getStringSize ( start , i ) > 1000 ) { continue; } // when using pdftohtml, the title tag is the filename when PDF property does not have title tag if ( tid == TAG_TITLE && contentType == CT_PDF ) { // skip if title == '/in.[0-9]*' char* title_start = words->getWord(start); char* title_end = words->getWord(i); size_t title_size = title_end - title_start; const char* result = strnstr( title_start, "/in.", title_size ); if (result != NULL) { char* endp = NULL; // do some further verification to avoid screwing up title if ((strtoll(result + 4, &endp, 10) > 0) && (endp == title_end)) { continue; } } } // count it table[tid]++; // max it out if we are positive scoring. stop after the // first positive scoring guy in a section. this might // hurt the "Hamlet" thing though... // store a point to the title tag guy. Msg20.cpp needs this // because the zak's proximity algo uses it in Summary.cpp // and in Msg20.cpp // only get the first one! often the 2nd on is in an iframe!! which we now expand into here. if ( tid == TAG_TITLE && m_titleTagStart == -1 ) { m_titleTagStart = start; m_titleTagEnd = i; // save the candidate # because we always use this // as the title if we are a root if ( tti < 0 ) { tti = n; } } // point to words class of the body that was passed in to us cptrs[n] = words; as[n] = start; bs[n] = i; if ( tid == TAG_B ) { types[n] = TT_BOLDTAG; scores[n] = 1.0; } else if ( tid == TAG_H1 ) { types[n] = TT_HTAG; scores[n] = 1.8; } else if ( tid == TAG_H2 ) { types[n] = TT_HTAG; scores[n] = 1.7; } else if ( tid == TAG_H3 ) { types[n] = TT_HTAG; scores[n] = 1.6; } else if ( tid == TAG_TITLE ) { types[n] = TT_TITLETAG; scores[n] = 3.0; } else if ( tid == TAG_DIV ) { types[n] = TT_DIVTAG; scores[n] = 1.0; } else if ( tid == TAG_TD ) { types[n] = TT_TDTAG; scores[n] = 1.0; } else if ( tid == TAG_P ) { types[n] = TT_PTAG; scores[n] = 1.0; } else if ( tid == TAG_FONT ) { types[n] = TT_FONTTAG; scores[n] = 1.0; } else if ( tid == TAG_A ) { types[n] = TT_ATAG; // . self link is very powerful BUT // http://www.npr.org/templates/story/story.php?storyId=5417137 // doesn't use it right! so use // 1.3 instead of 3.0. that has an "onClick" thing in the // <a> tag, so check for that! // this was bad for // http://www.spiritualwoman.net/?cat=191 // so i am demoting from 3.0 to 1.5 scores[n] = 1.5; } // count it n++; // start loop over at tag #i, for loop does an i++, so negate // that so this will work i--; // break out if too many already. save some for below. if ( n + 10 >= MAX_TIT_CANDIDATES ) { break; } } //logf(LOG_DEBUG,"title: took3=%" PRId64,gettimeofdayInMilliseconds()-x); //x = gettimeofdayInMilliseconds(); // to handle text documents, throw in the first line of text // as a title candidate, just make the score really low bool textDoc = (contentType == CT_UNKNOWN || contentType == CT_TEXT); if (textDoc) { // make "i" point to first alphabetical word in the document int32_t i ; for ( i = 0 ; i < NW && !words->isAlpha(i) ; i++); // if we got a first alphabetical word, then assume that to be the start of our title if ( i < NW && n < MAX_TIT_CANDIDATES ) { // first word in title is "t0" int32_t t0 = i; // find end of first line int32_t numWords = 0; // set i to the end now. 
we MUST find a \n to terminate the // title, otherwise we will not have a valid title while (i < NW && numWords < maxTitleWords && (words->isAlnum(i) || !words->hasChar(i, '\n'))) { if(words->isAlnum(i)) { numWords++; } ++i; } // "t1" is the end int32_t t1 = -1; // we must have found our \n in order to set "t1" if (i <= NW && numWords < maxTitleWords ) { t1 = i; } // set the ptrs cptrs [n] = words; // this is the last resort i guess... scores [n] = 0.5; types [n] = TT_FIRSTLINE; as [n] = t0; bs [n] = t1; // add it as a candidate if t0 and t1 were valid if (t0 >= 0 && t1 > t0) { n++; } } } //logf(LOG_DEBUG,"title: took4=%" PRId64,gettimeofdayInMilliseconds()-x); //x = gettimeofdayInMilliseconds(); { // now add the last url path to contain underscores or hyphens char *pstart = firstUrl->getPath(); // get first url Url *fu = firstUrl; // start at the end char *p = fu->getUrl() + fu->getUrlLen(); // end pointer char *pend = NULL; // come up here for each path component while ( p >= pstart ) { // save end pend = p; // skip over / if ( *p == '/' ) { p--; } // now go back to next / int32_t count = 0; for ( ; p >= pstart && *p !='/' ; p-- ) { if ( *p == '_' || *p == '-' ) { count++; } } // did we get it? if ( count > 0 ) { break; } } // did we get any? if ( p > pstart && n < MAX_TIT_CANDIDATES ) { // now set words to that if ( ! tw[ti].set ( p, (pend - p), true, 0 )) { return false; } // point to that cptrs [n] = &tw[ti]; as [n] = 0; bs [n] = tw[ti].getNumWords(); scores [n] = 1.0; types [n] = TT_URLPATH; // increment since we are using it ti++; // advance n++; } } // save old n int32_t oldn = n; // . do not split titles if we are a root url maps.yahoo.com was getting "Maps" for the title if ( firstUrl->isRoot() ) { oldn = -2; } // point to list of \0 separated titles const char *rootTitleBuf = NULL; const char *rootTitleBufEnd = NULL; // get the root title if we are not root! if (filteredRootTitleBuf) { #ifdef _VALGRIND_ VALGRIND_CHECK_MEM_IS_DEFINED(filteredRootTitleBuf,filteredRootTitleBufSize); #endif // point to list of \0 separated titles rootTitleBuf = filteredRootTitleBuf; rootTitleBufEnd = filteredRootTitleBuf + filteredRootTitleBufSize; } { Matches m; if ( rootTitleBuf && query ) { m.setQuery ( query ); } // convert into an array int32_t nr = 0; const char *pr = rootTitleBuf; const char *rootTitles[20]; int32_t rootTitleLens[20]; // loop over each root title segment for ( ; pr && pr < rootTitleBufEnd ; pr += strnlen(pr,rootTitleBufEnd-pr) + 1 ) { // if we had a query... if ( query ) { // reset it m.reset(); // see if root title segment has query terms in it m.addMatches ( const_cast<char*>(pr), strnlen(pr,rootTitleBufEnd-pr), MF_TITLEGEN, m_niceness ); // if matches query, do NOT add it, we only add it for // removing from the title of the page... if ( m.getNumMatches() ) { continue; } } // point to it. it should start with an alnum already // since it is the "filtered" list of root titles... // if not, fix it in xmldoc then. rootTitles [nr] = pr; rootTitleLens[nr] = gbstrlen(pr); // advance nr++; // no breaching if ( nr >= 20 ) break; } // now split up candidates in children candidates by tokenizing // using :, | and - as delimters. // the hyphen must have a space on at least one side, so "cd-rom" does // not create a pair of tokens... // FIX: for the title: // Best Careers 2009: Librarian - US News and World Report // we need to recognize "Best Careers 2009: Librarian" as a subtitle // otherwise we don't get it as the title. 
so my question is are we // going to have to do all the permutations at some point? for now // let's just add in pairs... for ( int32_t i = 0 ; i < oldn && n + 3 < MAX_TIT_CANDIDATES ; i++ ) { // stop if no root title segments if ( nr <= 0 ) break; // get the word info Words *w = cptrs[i]; int32_t a = as[i]; int32_t b = bs[i]; // init int32_t lasta = a; char prev = false; // char length in bytes //int32_t charlen = 1; // see how many we add int32_t added = 0; char *skipTo = NULL; bool qualified = true; // . scan the words looking for a token // . sometimes the candidates end in ": " so put in "k < b-1" // . made this from k<b-1 to k<b to fix // "Hot Tub Time Machine (2010) - IMDb" to strip IMDb for ( int32_t k = a ; k < b && n + 3 < MAX_TIT_CANDIDATES; k++){ // get word char *wp = w->getWord(k); // skip if not alnum if ( ! w->isAlnum(k) ) { // in order for next alnum word to // qualify for "clipping" if it matches // the root title, there has to be more // than just spaces here, some punct. // otherwise title // "T. D. Jakes: Biography from Answers.com" // becomes // "T. D. Jakes: Biography from" qualified=isWordQualified(wp,w->getWordLen(k)); continue; } // gotta be qualified! if ( ! qualified ) continue; // skip if in root title if ( skipTo && wp < skipTo ) continue; // does this match any root page title segments? int32_t j; for ( j = 0 ; j < nr ; j++ ) { // . compare to root title // . break out if we matched! if ( ! strncmp( wp, rootTitles[j], rootTitleLens[j] ) ) { break; } } // if we did not match a root title segment, // keep on chugging if ( j >= nr ) continue; // . we got a root title match! // . skip over skipTo = wp + rootTitleLens[j]; // must land on qualified punct then!! int32_t e = k+1; for ( ; e<b && w->getWord(e)<skipTo ; e++ ); // ok, word #e must be a qualified punct if ( e<b && ! isWordQualified(w->getWord(e),w->getWordLen(e))) // assume no match then!! continue; // if we had a previous guy, reset the end of the // previous candidate if ( prev ) { bs[n-2] = k; bs[n-1] = k; } // . ok, we got two more candidates // . well, only one more if this is not the 1st time if ( ! prev ) { cptrs [n] = cptrs [i]; scores [n] = scores [i]; types [n] = types [i]; as [n] = lasta; bs [n] = k; parent [n] = i; n++; added++; } // the 2nd one cptrs [n] = cptrs [i]; scores [n] = scores [i]; types [n] = types [i]; as [n] = e + 1; bs [n] = bs [i]; parent [n] = i; n++; added++; // now add in the last pair as a whole token cptrs [n] = cptrs [i]; scores [n] = scores [i]; types [n] = types [i]; as [n] = lasta; bs [n] = bs [i]; parent [n] = i; n++; added++; // nuke the current candidate then since it got // split up to not contain the root title... //cptrs[i] = NULL; // update this lasta = k+1; // if we encounter another delimeter we will have to revise bs[n-1], so note that prev = true; } // nuke the current candidate then since it got // split up to not contain the root title... if ( added ) { scores[i] = 0.001; //cptrs[i] = NULL; } // erase the pair if that there was only one token if ( added == 3 ) n--; } } for ( int32_t i = 0 ; i < n ; i++ ) baseScore[i] = scores[i]; // // . now punish by 0.85 for every lower case non-stop word it has // . reward by 1.1 if has a non-stopword in the query // for ( int32_t i = 0 ; i < n ; i++ ) { // point to the words Words *w = cptrs[i]; // skip if got nuked above if ( ! 
w ) { continue; } // the word ptrs char **wptrs = w->getWordPtrs(); // skip if empty if ( w->getNumWords() <= 0 ) { continue; } // get the word boundaries int32_t a = as[i]; int32_t b = bs[i]; // record the boosts float ncb = 1.0; float qtb = 1.0; // a flag char uncapped = false; // scan the words in this title candidate for ( int32_t j = a ; j < b ; j++ ) { // skip stop words if ( w->isQueryStopWord( j, langId ) ) { continue; } // punish if uncapitalized non-stopword if ( ! w->isCapitalized(j) ) { uncapped = true; } // skip if no query if ( ! query ) { continue; } int64_t wid = w->getWordId(j); // reward if in the query if ( query->getWordNum(wid) >= 0 ) { qtb *= 1.5; scores[i] *= 1.5; } } // . only punish once if missing a capitalized word hurts us for: // http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html if ( uncapped ) { ncb *= 1.00; scores[i] *= 1.00; } // punish if a http:// title thingy char *s = wptrs[a]; int32_t size = w->getStringSize(a,b); if ( size > 9 && memcmp("http://", s, 7) == 0 ) { ncb *= .10; } if ( size > 14 && memcmp("h\0t\0t\0p\0:\0/\0/", s, 14) == 0 ) { ncb *= .10; } // set these guys scores[i] *= ncb; noCapsBoost[i] = ncb; qtermsBoost[i] = qtb; } // . now compare each candidate to the other candidates // . give a boost if matches for ( int32_t i = 0 ; i < n ; i++ ) { // point to the words Words *w1 = cptrs[i]; // skip if got nuked above if ( ! w1 ) { continue; } int32_t a1 = as[i]; int32_t b1 = bs[i]; // reset some flags char localFlag1 = 0; char localFlag2 = 0; // record the boost float iccb = 1.0; // total boost float total = 1.0; // to each other candidate for ( int32_t j = 0 ; j < n ; j++ ) { // not to ourselves if ( j == i ) { continue; } // or our derivatives if ( parent[j] == i ) { continue; } // or derivates to their parent if ( parent[i] == j ) { continue; } // only check parents now. do not check kids. // this was only for when doing percent contained // not getSimilarity() per se //if ( parent[j] != -1 ) continue; // TODO: do not accumulate boosts from a parent // and its kids, subtitles... // // do not compare type X to type Y if ( types[i] == TT_TITLETAG ) { if ( types[j] == TT_TITLETAG ) { continue; } } // do not compare a div candidate to another div cand // http://friendfeed.com/foxiewire?start=30 // likewise, a TD to another TD // http://content-uk.cricinfo.com/ausvrsa2008_09/engine/match/351681.html // ... etc. if ( types[i] == TT_BOLDTAG || types[i] == TT_HTAG || types[i] == TT_DIVTAG || types[i] == TT_TDTAG || types[i] == TT_FONTTAG ) { if ( types[j] == types[i] ) continue; } // . do not compare one kid to another kid // . i.e. if we got "x | y" as a title and "x | z" // as a link text, it will emphasize "x" too much // http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html if ( parent[j] != -1 && parent[i] != -1 ) continue; // . body type tags are mostly mutually exclusive // . for the legacy.com url mentioned below, we have // good stuff in <td> tags, so this hurts us... // . 
but for the sake of // http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/ // i put bold tags back if ( types[i] == TT_LINKTEXTLOCAL ) { if ( types[j] == TT_LINKTEXTLOCAL ) continue; } if ( types[i] == TT_RSSITEMLOCAL ) { if ( types[j] == TT_RSSITEMLOCAL ) continue; } // only compare to one local link text for each i if ( types[j] == TT_LINKTEXTLOCAL && localFlag1 ) { continue; } if ( types[j] == TT_RSSITEMLOCAL && localFlag2 ) { continue; } if ( types[j] == TT_LINKTEXTLOCAL ) { localFlag1 = 1; } if ( types[j] == TT_RSSITEMLOCAL ) { localFlag2 = 1; } // not link title attr to link title attr either // fixes http://www.spiritualwoman.net/?cat=191 if ( types[i] == TT_TITLEATT && types[j] == TT_TITLEATT ) continue; // get our words Words *w2 = cptrs[j]; // skip if got nuked above if ( ! w2 ) continue; int32_t a2 = as [j]; int32_t b2 = bs [j]; // how similar is title #i to title #j ? float fp = getSimilarity ( w2 , a2 , b2 , w1 , a1 , b1 ); // error? if ( fp == -1.0 ) return false; // custom boosting... float boost = 1.0; if ( fp >= .95 ) boost = 3.0; else if ( fp >= .90 ) boost = 2.0; else if ( fp >= .85 ) boost = 1.5; else if ( fp >= .80 ) boost = 1.4; else if ( fp >= .75 ) boost = 1.3; else if ( fp >= .70 ) boost = 1.2; else if ( fp >= .60 ) boost = 1.1; else if ( fp >= .50 ) boost = 1.08; else if ( fp >= .40 ) boost = 1.04; // limit total total *= boost; if ( total > 100.0 ) break; // if you are matching the url path, that is pretty // good so give more! // actually, that would hurt: // http://michellemalkin.com/2008/12/29/gag-worthy/ // custom boosting! if ( fp > 0.0 && g_conf.m_logDebugTitle ) logf(LOG_DEBUG,"title: i=%" PRId32" j=%" PRId32" fp=%.02f " "b=%.02f", i,j,fp,boost); // apply it scores[i] *= boost; iccb *= boost; } inCommonCandBoost[i] = iccb; } //logf(LOG_DEBUG,"title: took7=%" PRId64,gettimeofdayInMilliseconds()-x); //x = gettimeofdayInMilliseconds(); // loop over all n candidates for ( int32_t i = 0 ; i < n ; i++ ) { // skip if not in the document body if ( cptrs[i] != words ) continue; // point to the words int32_t a1 = as [i]; int32_t b1 = bs [i]; // . loop through this candidates words // . TODO: use memset here? for ( int32_t j = a1 ; j <= b1 && j < NW ; j++ ) { // flag it flags[j] |= 0x01; } } // free our stuff if ( flags!=localBuf ) { mfree (flags, need, "TITLEflags"); } // now get the highest scoring candidate title float max = -1.0; int32_t winner = -1; for ( int32_t i = 0 ; i < n ; i++ ) { // skip if got nuked if ( ! cptrs[i] ) { continue; } if ( winner != -1 && scores[i] <= max ) { continue; } // url path's cannot be titles in and of themselves if ( types[i] == TT_URLPATH ) { continue; } // skip if empty basically, like if title was exact // copy of root, then the whole thing got nuked and // some empty string added, where a > b if ( as[i] >= bs[i] ) { continue; } // got one max = scores[i]; // save it winner = i; } // if we are a root, always pick the title tag as the title if ( oldn == -2 && tti >= 0 ) { winner = tti; } // if no winner, all done. no title if ( winner == -1 ) { // last resort use file name if ((contentType == CT_PDF) && (firstUrl->getFilenameLen() != 0)) { Words w; w.set(firstUrl->getFilename(), firstUrl->getFilenameLen(), true); if (!copyTitle(&w, 0, w.getNumWords())) { return false; } } return true; } // point to the words class of the winner Words *w = cptrs[winner]; // skip if got nuked above if ( ! 
w ) { char *xx=NULL;*xx=0; } // need to make our own Pos class if title not from body Pos tp; if ( w != words ) { // set "Scores" ptr to NULL. we assume all are positive scores if ( ! tp.set ( w ) ) { return false; } } // the string ranges from word #a up to and including word #b int32_t a = as[winner]; int32_t b = bs[winner]; // sanity check if ( a < 0 || b > w->getNumWords() ) { char*xx=NULL;*xx=0; } // save the title if ( ! copyTitle(w, a, b) ) { return false; } /* // debug logging SafeBuf sb; SafeBuf *pbuf = &sb; log("title: candidates for %s",xd->getFirstUrl()->getUrl() ); pbuf->safePrintf("<div stype=\"border:1px solid black\">"); pbuf->safePrintf("<b>***Finding Title***</b><br>\n"); pbuf->safePrintf("<table cellpadding=5 border=2><tr>" "<td colspan=20><center><b>Title Generation</b>" "</center></td>" "</tr>\n<tr>" "<td>#</td>" "<td>type</td>" "<td>parent</td>" "<td>base score</td>" "<td>format penalty</td>" "<td>query term boost</td>" "<td>candidate intersection boost</td>" "<td>FINAL SCORE</td>" "<td>title</td>" "</tr>\n" ); // print out all candidates for ( int32_t i = 0 ; i < n ; i++ ) { char *ts = "unknown"; if ( types[i] == TT_LINKTEXTLOCAL ) ts = "local inlink text"; if ( types[i] == TT_LINKTEXTREMOTE ) ts = "remote inlink text"; if ( types[i] == TT_RSSITEMLOCAL ) ts = "local rss title"; if ( types[i] == TT_RSSITEMREMOTE ) ts = "remote rss title"; if ( types[i] == TT_BOLDTAG ) ts = "bold tag"; if ( types[i] == TT_HTAG ) ts = "header tag"; if ( types[i] == TT_TITLETAG ) ts = "title tag"; if ( types[i] == TT_FIRSTLINE ) ts = "first line in text"; if ( types[i] == TT_FONTTAG ) ts = "font tag"; if ( types[i] == TT_ATAG ) ts = "anchor tag"; if ( types[i] == TT_DIVTAG ) ts = "div tag"; if ( types[i] == TT_TDTAG ) ts = "td tag"; if ( types[i] == TT_PTAG ) ts = "p tag"; if ( types[i] == TT_URLPATH ) ts = "url path"; if ( types[i] == TT_TITLEATT ) ts = "title attribute"; // get the title pbuf->safePrintf( "<tr>" "<td>#%" PRId32"</td>" "<td><nobr>%s</nobr></td>" "<td>%" PRId32"</td>" "<td>%0.2f</td>" // baseScore "<td>%0.2f</td>" "<td>%0.2f</td>" "<td>%0.2f</td>" "<td>%0.2f</td>" "<td>", i, ts , parent[i], baseScore[i], noCapsBoost[i], qtermsBoost[i], inCommonCandBoost[i], scores[i]); // ptrs Words *w = cptrs[i]; int32_t a = as[i]; int32_t b = bs[i]; // skip if no words if ( w->getNumWords() <= 0 ) continue; // the word ptrs char **wptrs = w->getWordPtrs(); // string ptrs char *ptr = wptrs[a];//w->getWord(a); int32_t size = w->getStringSize(a,b); // it is utf8 pbuf->safeMemcpy ( ptr , size ); // end the line pbuf->safePrintf("</td></tr>\n"); } pbuf->safePrintf("</table>\n<br>\n"); // log these for now log("title: %s",sb.getBufStart()); */ return true; }
// . return length stored into "buf"
// . content must be NULL terminated
// . if "useAnchors" is true we do click and scroll
// . if "isQueryTerms" is true, we do typical anchors in a special way
int32_t Highlight::set(SafeBuf *sb,
                       char *content,
                       int32_t contentLen,
                       // primary language of the document (for synonyms)
                       char docLangId,
                       Query *q,
                       bool doStemming,
                       bool useAnchors,
                       const char *baseUrl,
                       const char *frontTag,
                       const char *backTag,
                       int32_t fieldCode,
                       int32_t niceness)
{
    Words words;
    if (!words.set(content,
                   contentLen,
                   TITLEREC_CURRENT_VERSION,
                   true,   // computeId
                   true))  // has html entities?
        return -1;

    int32_t version = TITLEREC_CURRENT_VERSION;

    Bits bits;
    if (!bits.set(&words, version, niceness))
        return -1;

    Phrases phrases;
    if (!phrases.set(&words, &bits, true, false, version, niceness))
        return -1;

    Matches matches;
    matches.setQuery(q);
    if (!matches.addMatches(&words, &phrases))
        return -1;

    // store
    m_numMatches = matches.getNumMatches();

    return set(sb,
               &words,
               &matches,
               doStemming,
               useAnchors,
               baseUrl,
               frontTag,
               backTag,
               fieldCode,
               q);
}
int main(int ac, char* av[]) { cout << "InfinitePanorama. v0.2 -- (C) 2012 by Wilston Oreo." << endl; stringstream descStr; descStr << "Allowed options:" << endl; //descStr << "Panorama mode (interactive):" << endl; //descStr << "\tinfinitepanorama -P -l left.dat -r right.dat" << endl << endl; descStr << "Panorama mode (generate output file):" << endl; descStr << "\tinfinitepanorama -P -l left.dat -r right.dat -n 10 -h 1024 -o output.png" << endl << endl; descStr << "Database mode (generate database from directory):" << endl; descStr << "\tinfinitepanorama -D -i inputdir -l left.dat -r right.dat" << endl; descStr << "Database mode (generate database from filelist):" << endl; descStr << "\tinfinitepanorama -D -f filelist -l left.dat -r right.dat" << endl; descStr << "Preprocessing mode (generate database from filelist):" << endl; descStr << "\tinfinitepanorama -S -f filelist -i input.jpg -l left.dat -r right.dat" << endl; po::options_description desc(descStr.str()); string inputDir, fileList, databaseFileLeft, databaseFileRight; string outputImageFile, configFile; int width = 10240; int height = 768; int gistCount = 10000,histLargeCount = 100, histSmallCount = 0, thumbCount = 0; desc.add_options() ("help", "Display help message.") ("database,D", "Database mode") ("panorama,P", "Panorama mode") ("preprocess,S","Preprocessing mode") ("config,c", po::value<string>(&configFile), "Config file") ("inputdir,i", po::value<string>(&inputDir), "Input directory") ("filelist,f", po::value<string>(&fileList), "File list or data base list") ("left,l", po::value<string>(&databaseFileLeft), "Database for left part of images") ("right,r", po::value<string>(&databaseFileRight),"Database for right part of images") ("output,o", po::value<string>(&outputImageFile), "Output image file") ("width,w", po::value<int>(&width), "Panorama width. Default: 10240") ("height,h", po::value<int>(&height), "Panorama height. 
Default: 768") ("gist", po::value<int>(&gistCount), "Gist count") ("histlarge", po::value<int>(&histLargeCount), "Histogram (large) count") ("histsmall", po::value<int>(&histSmallCount), "Histogram (small) count") ("thumb", po::value<int>(&thumbCount), "Thumbnail count") ("linear", "Enable linear blending (default)") ("poisson", "Enable poisson blending") ; po::variables_map vm; po::store(po::parse_command_line(ac, av, desc), vm); po::notify(vm); #define V vm.count if (V("help")) { cout << desc << endl; return 1; } if (V("config")) loadConfig(configFile); if (V("gist")) config.set("FILTER_GIST_MATCHES",gistCount); if (V("histsmall")) config.set("FILTER_HISTSMALL_MATCHES",histSmallCount); if (V("histlarge")) config.set("FILTER_HISTLARGE_MATCHES",histLargeCount); if (V("thumb")) config.set("FILTER_THUMBNAIL_MATCHES",thumbCount); Panorama pan(width,height,&config); config.print(); Panorama::BlendingMode blend = Panorama::BLEND_LINEAR; if (V("poisson")) blend = Panorama::BLEND_POISSON; if (V("preprocess") && V("left") && V("right") && V("inputdir")) { Image image(inputDir); float border = 1.0f/6.0f; Descriptor leftDesc(image,int(image.columns()*border),image.rows(),0,0); Descriptor rightDesc(image,int(image.columns()*border),image.rows(),int((1.0f-border)*image.columns()),0); LOG->level(2); config.set("FILTER_GIST_MATCHES",5000); config.set("FILTER_HISTSMALL_MATCHES",0); config.set("FILTER_HISTLARGE_MATCHES",0); config.set("FILTER_THUMBNAIL_MATCHES",0); Database leftFinal(&config), rightFinal(&config); vector< left_right > files = loadDatabaseList(fileList); BOOST_FOREACH( left_right& file, files) { Statistics statistics(0,true); DescriptorFilter filter(&config,&statistics); LOG_MSG << fmt("Reading databases '%' (left) and '%' (right)...") % file.first % file.second; Database leftTmp(file.first), rightTmp(file.second); Descriptors leftDescs = leftTmp.descriptors(), rightDescs = rightTmp.descriptors(); Matches leftMatches = filter.getMatches(leftDesc,leftDescs); BOOST_FOREACH ( const Match& m, leftMatches) { if (m.desc) { statistics.exclude(leftDescs[m.desc->index()]); statistics.exclude(rightDescs[m.desc->index()]); } } Matches rightMatches = filter.getMatches(rightDesc,rightDescs); Matches matches = leftMatches; BOOST_FOREACH ( const Match& m, rightMatches ) matches.insert(m); BOOST_FOREACH ( const Match& m, matches) { if (m.desc) { Descriptor* newLeft = new Descriptor(leftTmp[m.desc->index()]); Descriptor* newRight = new Descriptor(rightTmp[m.desc->index()]); leftFinal.push_back(newLeft); rightFinal.push_back(newRight); } } LOG_MSG << fmt("Left database contains %, right database contains %") % leftFinal.size() % rightFinal.size(); leftTmp.clear(); rightTmp.clear(); }
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
    // get it
    State2 *st = (State2 *)state;
    // get the tcp socket from the state
    TcpSocket *s = st->m_socket;
    // get it
    XmlDoc *xd = &st->m_xd;
    if ( ! xd->m_loaded ) {
        // setting just the docid. niceness is 0.
        //xd->set3 ( st->m_docId , st->m_coll , 0 );
        // callback
        xd->setCallback ( state , processLoop );
        // . and tell it to load from the old title rec
        // . this sets xd->m_oldTitleRec/m_oldTitleRecSize
        // . this sets xd->ptr_* and all other member vars from
        //   the old title rec if found in titledb.
        if ( ! xd->loadFromOldTitleRec ( ) ) return false;
    }
    if ( g_errno ) return sendErrorReply ( st , g_errno );
    // now force it to load old title rec
    //char **tr = xd->getTitleRec();
    SafeBuf *tr = xd->getTitleRecBuf();
    // blocked? return false if so. it will call processLoop() when it returns
    if ( tr == (void *)-1 ) return false;
    // we did not block. check for error? this will free "st" too.
    if ( ! tr ) return sendErrorReply ( st , g_errno );
    // if title rec was empty, that is a problem
    if ( xd->m_titleRecBuf.length() == 0 ) return sendErrorReply ( st , ENOTFOUND);
    // get the noarchive flag
    char *na = xd->getIsNoArchive();
    // wait if blocked
    if ( na == (void *)-1 ) return false;
    // error?
    if ( ! na ) return sendErrorReply ( st , g_errno );
    // forbidden? allow turkeys through though...
    if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE );
    SafeBuf *sb = &st->m_sb;
    // &page=4 will print rainbow sections
    if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
        // do not repeat this call
        st->m_printed = true;
        // this will call us again since we called
        // xd->setCallback() above to us
        if ( ! xd->printDocForProCog ( sb , &st->m_r ) ) return false;
    }
    char *contentType = "text/html";
    char format = st->m_format;
    if ( format == FORMAT_XML ) contentType = "text/xml";
    if ( format == FORMAT_JSON ) contentType = "application/json";
    // if we printed a special page (like rainbow sections) then return now
    if ( st->m_printed ) {
        bool status = g_httpServer.sendDynamicPage (s,
                                                    //buf,bufLen,
                                                    sb->getBufStart(),
                                                    sb->getLength(),
                                                    -1,false,
                                                    //"text/html",
                                                    contentType,
                                                    -1, NULL, "utf8" );
        // nuke state2
        mdelete ( st , sizeof(State2) , "PageGet1" );
        delete (st);
        return status;
    }
    /*
    // this was calling XmlDoc and setting sections, etc. to
    // get the SpiderReply junk... no no no
    // is it banned or filtered? this ignores the TagRec in the titleRec
    // and uses msg8a to get it fresh instead
    char *vi = xd->getIsFiltered();//Visible( );
    // wait if blocked
    if ( vi == (void *)-1 ) return false;
    // error?
    if ( ! vi ) return sendErrorReply ( st , g_errno );
    // banned?
    if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
    */
    // get the utf8 content
    char **utf8 = xd->getUtf8Content();
    //long len = xd->size_utf8Content - 1;
    // wait if blocked???
    if ( utf8 == (void *)-1 ) return false;
    // strange
    if ( xd->size_utf8Content <= 0 ) {
        log("pageget: utf8 content <= 0");
        return sendErrorReply(st,EBADENGINEER );
    }
    // alloc error?
    if ( ! utf8 ) return sendErrorReply ( st , g_errno );
    // get this host
    Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
    if ( ! h ) {
        log("pageget: hostid %li is bad",g_hostdb.m_hostId);
        return sendErrorReply(st,EBADENGINEER );
    }
    char *content    = xd->ptr_utf8Content;
    long  contentLen = xd->size_utf8Content - 1;
    // shortcut
    char strip = st->m_strip;
    // alloc buffer now
    //char *buf = NULL;
    //long bufMaxSize = 0;
    //bufMaxSize = len + ( 32 * 1024 ) ;
    //bufMaxSize = contentLen + ( 32 * 1024 ) ;
    //buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
    //char *p = buf;
    //char *bufEnd = buf + bufMaxSize;
    //if ( ! buf ) {
    //    return sendErrorReply ( st , g_errno );
    //}
    // for undoing the header
    //char *start1 = p;
    long startLen1 = sb->length();
    // we are always utf8
    if ( strip != 2 )
        sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
                        "content=\"text/html;charset=utf8\">\n");
    // base href
    //Url *base = &xd->m_firstUrl;
    //if ( xd->ptr_redirUrl.m_url[0] )
    //    base = &xd->m_redirUrl;
    char *base = xd->ptr_firstUrl;
    if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
    //Url *redir = *xd->getRedirUrl();
    if ( strip != 2 ) {
        sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
        //p += gbstrlen ( p );
    }
    // default colors in case css files missing
    if ( strip != 2 ) {
        sb->safePrintf( "\n<style type=\"text/css\">\n"
                        "body{background-color:white;color:black;}\n"
                        "</style>\n");
        //p += gbstrlen ( p );
    }
    //char format = st->m_format;
    if ( format == FORMAT_XML  ) sb->reset();
    if ( format == FORMAT_JSON ) sb->reset();
    // for undoing the stuff below
    long startLen2 = sb->length();//p;
    // query should be NULL terminated
    char *q    = st->m_q;
    long  qlen = st->m_qlen;
    char styleTitle[128] = "font-size:14px;font-weight:600;"
                           "color:#000000;";
    char styleText[128]  = "font-size:14px;font-weight:400;"
                           "color:#000000;";
    char styleLink[128]  = "font-size:14px;font-weight:400;"
                           "color:#0000ff;";
    char styleTell[128]  = "font-size:14px;font-weight:600;"
                           "color:#cc0000;";
    // get the url of the title rec
    Url *f = xd->getFirstUrl();
    bool printDisclaimer = st->m_printDisclaimer;
    if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
    if ( format == FORMAT_XML  ) printDisclaimer = false;
    if ( format == FORMAT_JSON ) printDisclaimer = false;
    char tbuf[100];
    tbuf[0] = 0;
    time_t lastSpiderDate = xd->m_spideredTime;
    if ( printDisclaimer ||
         format == FORMAT_XML ||
         format == FORMAT_JSON ) {
        struct tm *timeStruct = gmtime ( &lastSpiderDate );
        strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
    }
    // We should always be displaying this disclaimer.
    // - May eventually want to display this at a different location
    //   on the page, or on the click 'n' scroll browser page itself
    //   when this page is not being viewed solo.
    // CNS: if ( ! st->m_clickNScroll ) {
    if ( printDisclaimer ) {
        sb->safePrintf(//sprintf ( p ,
                       //"<BASE HREF=\"%s\">"
                       //"<table border=1 width=100%%>"
                       //"<tr><td>"
                       "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" "
                       //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
                       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
                       "<tr"
                       //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
                       "><td>"
                       //"<font face=times,sans-serif color=black size=-1>"
                       "<span style=\"%s\">"
                       "This is Gigablast's cached page of </span>"
                       "<a href=\"%s\" style=\"%s\">%s</a>"
                       "" ,
                       styleTitle, f->getUrl(), styleLink, f->getUrl() );
        //p += gbstrlen ( p );
        // then the rest
        //sprintf(p ,
        sb->safePrintf(
                       "<span style=\"%s\">. "
                       "Gigablast is not responsible for the content of "
                       "this page.</span>", styleTitle );
        //p += gbstrlen ( p );
        sb->safePrintf ( "<br/><span style=\"%s\">"
                         "Cached: </span>"
                         "<span style=\"%s\">",
                         styleTitle, styleText );
        //p += gbstrlen ( p );
        // then the spider date in GMT
        // time_t lastSpiderDate = xd->m_spideredTime;
        // struct tm *timeStruct = gmtime ( &lastSpiderDate );
        // char tbuf[100];
        // strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
        //p += gbstrlen ( p );
        sb->safeStrcpy(tbuf);
        // Moved over from PageResults.cpp
        sb->safePrintf( "</span> - <a href=\""
                        "/get?"
                        "q=%s&c=%s&rtq=%li&"
                        "d=%lli&strip=1\""
                        " style=\"%s\">"
                        "[stripped]</a>",
                        q , st->m_coll , (long)st->m_rtq,
                        st->m_docId, styleLink );
        // a link to alexa
        if ( f->getUrlLen() > 5 ) {
            sb->safePrintf( " - <a href=\"http:"
                            "//web.archive.org/web/*/%s\""
                            " style=\"%s\">"
                            "[older copies]</a>" ,
                            f->getUrl(), styleLink );
        }
        if (st->m_noArchive){
            sb->safePrintf( " - <span style=\"%s\"><b>"
                            "[NOARCHIVE]</b></span>",
                            styleTell );
        }
        if (st->m_isBanned){
            sb->safePrintf(" - <span style=\"%s\"><b>"
                           "[BANNED]</b></span>",
                           styleTell );
        }
        // only print this if we got a query
        if ( qlen > 0 ) {
            sb->safePrintf("<br/><br/><span style=\"%s\"> "
                           "These search terms have been "
                           "highlighted: ",
                           styleText );
            //p += gbstrlen ( p );
        }
    }
    // how much space left in p?
    //long avail = bufEnd - p;
    // . make the url that we're outputting for (like in PageResults.cpp)
    // . "thisUrl" is the baseUrl for click & scroll
    char thisUrl[MAX_URL_LEN];
    char *thisUrlEnd = thisUrl + MAX_URL_LEN;
    char *x = thisUrl;
    // . use the external ip of our gateway
    // . construct the NAT mapped port
    // . you should have used iptables to map port to the correct
    //   internal ip:port
    //unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp;
    //unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
    // local check
    //if ( st->m_isLocal ) {
    unsigned long ip = h->m_ip;
    unsigned short port = h->m_httpPort;
    //}
    //sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
    // . we no longer put the port in here
    // . but still need http:// since we use <base href=>
    if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
    else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
    x += gbstrlen ( x );
    // the query url encoded
    long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
    x += elen;
    // separate cgi vars with a &
    //sprintf ( x, "&seq=%li&rtq=%lid=%lli",
    //          (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
    sprintf ( x, "&d=%lli",st->m_docId );
    x += gbstrlen(x);
    // set our query for highlighting
    Query qq;
    qq.set2 ( q, st->m_langId , true );
    // print the query terms into our highlight buffer
    Highlight hi;
    // make words so we can set the scores to ignore fielded terms
    Words qw;
    qw.set ( q    , // content being highlighted, utf8
             qlen , // content being highlighted, utf8
             TITLEREC_CURRENT_VERSION,
             true , // computeIds
             false ); // hasHtmlEntities?
    // . assign scores of 0 to query words that should be ignored
    // . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
    //   with words in qw.
    // . sanity check
    //if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
    // declare up here
    Matches m;
    // do the loop
    //Scores ss;
    //ss.set ( &qw , NULL );
    //for ( long i = 0 ; i < qq.m_numWords ; i++ )
    //    if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
    // now set m.m_matches[] to those words in qw that match a query word
    // or phrase in qq.
    m.setQuery ( &qq );
    //m.addMatches ( &qw , &ss , true );
    m.addMatches ( &qw );
    long hilen = 0;
    // CNS: if ( ! st->m_clickNScroll ) {
    // and highlight the matches
    if ( printDisclaimer ) {
        hilen = hi.set ( //p       ,
                         //avail   ,
                         sb      ,
                         &qw     , // words to highlight
                         &m      , // matches relative to qw
                         false   , // doSteming
                         false   , // st->m_clickAndScroll ,
                         (char *)thisUrl );// base url for ClcknScrll
        //p += hilen;
        // now an hr
        //memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
        sb->safeStrcpy("</span></table></table>\n");
    }
    bool includeHeader = st->m_includeHeader;
    // do not show header for json object display
    if ( xd->m_contentType == CT_JSON ) includeHeader = false;
    if ( format == FORMAT_XML  ) includeHeader = false;
    if ( format == FORMAT_JSON ) includeHeader = false;
    //mfree(uq, uqCapacity, "PageGet");
    // undo the header writes if we should
    if ( ! includeHeader ) {
        // including base href is off by default when not including
        // the header, so the caller must explicitly turn it back on
        if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
        else                         sb->m_length=startLen1;//p=start1;
    }
    //sb->safeStrcpy(tbuf);
    if ( format == FORMAT_XML ) {
        sb->safePrintf("<response>\n");
        sb->safePrintf("<statusCode>0</statusCode>\n");
        sb->safePrintf("<statusMsg>Success</statusMsg>\n");
        sb->safePrintf("<url><![CDATA[");
        sb->cdataEncode(xd->m_firstUrl.m_url);
        sb->safePrintf("]]></url>\n");
        sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
        sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
                       lastSpiderDate);
        sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
    }
    if ( format == FORMAT_JSON ) {
        sb->safePrintf("{\"response\":{\n");
        sb->safePrintf("\t\"statusCode\":0,\n");
        sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
        sb->safePrintf("\t\"url\":\"");
        sb->jsonEncode(xd->m_firstUrl.m_url);
        sb->safePrintf("\",\n");
        sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
        sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
        sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
    }
    // identify start of <title> tag we wrote out
    char *sbstart = sb->getBufStart();
    char *sbend   = sb->getBufEnd();
    char *titleStart = NULL;
    char *titleEnd   = NULL;
    for ( char *t = sbstart ; t < sbend ; t++ ) {
        // title tag?
        if ( t[0]!='<' ) continue;
        if ( to_lower_a(t[1])!='t' ) continue;
        if ( to_lower_a(t[2])!='i' ) continue;
        if ( to_lower_a(t[3])!='t' ) continue;
        if ( to_lower_a(t[4])!='l' ) continue;
        if ( to_lower_a(t[5])!='e' ) continue;
        // point to it
        char *x = t + 5;
        // max - to keep things fast
        char *max = x + 500;
        for ( ; *x && *x != '>' && x < max ; x++ );
        x++;
        // find end
        char *e = x;
        for ( ; *e && e < max ; e++ ) {
            if ( e[0]=='<' &&
                 to_lower_a(e[1])=='/' &&
                 to_lower_a(e[2])=='t' &&
                 to_lower_a(e[3])=='i' &&
                 to_lower_a(e[4])=='t' &&
                 to_lower_a(e[5])=='l' &&
                 to_lower_a(e[6])=='e' )
                break;
        }
        if ( e < max ) {
            titleStart = x;
            titleEnd   = e;
        }
        break;
    }
    // . print title at top!
    // . consider moving
    if ( titleStart ) {
        char *ebuf = st->m_r.getString("eb");
        if ( ! ebuf ) ebuf = "";
        //p += sprintf ( p ,
        sb->safePrintf(
                       "<table border=1 "
                       "cellpadding=10 "
                       "cellspacing=0 "
                       "width=100%% "
                       "color=#ffffff>" );
        long printLinks = st->m_r.getLong("links",0);
        if ( ! printDisclaimer && printLinks )
            sb->safePrintf(//p += sprintf ( p ,
                           // first put cached and live link
                           "<tr>"
                           "<td bgcolor=lightyellow>"
                           // print cached link
                           //"<center>"
                           " "
                           "<b>"
                           "<a "
                           "style=\"font-size:18px;font-weight:600;"
                           "color:#000000;\" "
                           "href=\""
                           "/get?"
                           "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
                           "cached link</a>"
                           " "
                           "<a "
                           "style=\"font-size:18px;font-weight:600;"
                           "color:#000000;\" "
                           "href=%s>live link</a>"
                           "</b>"
                           //"</center>"
                           "</td>"
                           "</tr>\n"
                           ,st->m_coll
                           ,st->m_docId
                           ,ebuf
                           ,thisUrl // st->ptr_ubuf
                           );
        if ( printLinks ) {
            sb->safePrintf(//p += sprintf ( p ,
                           "<tr><td bgcolor=pink>"
                           "<span style=\"font-size:18px;"
                           "font-weight:600;"
                           "color:#000000;\">"
                           " "
                           "<b>PAGE TITLE:</b> " );
            long tlen = titleEnd - titleStart;
            sb->safeMemcpy ( titleStart , tlen );
            sb->safePrintf ( "</span></td></tr>" );
        }
        sb->safePrintf( "</table><br>\n" );
    }
    // is the content preformatted?
    bool pre = false;
    char ctype = (char)xd->m_contentType;
    if ( ctype == CT_TEXT ) pre = true ; // text/plain
    if ( ctype == CT_DOC  ) pre = true ; // filtered msword
    if ( ctype == CT_PS   ) pre = true ; // filtered postscript
    if ( format == FORMAT_XML  ) pre = false;
    if ( format == FORMAT_JSON ) pre = false;
    // if it is content-type text, add a <pre>
    if ( pre ) {//p + 5 < bufEnd && pre ) {
        sb->safePrintf("<pre>");
        //p += 5;
    }
    if ( st->m_strip == 1 )
        contentLen = stripHtml( content, contentLen,
                                (long)xd->m_version, st->m_strip );
    // it returns -1 and sets g_errno on error, like OOM
    if ( contentLen == -1 ) {
        //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
        return sendErrorReply ( st , g_errno );
    }
    Xml xml;
    Words ww;
    // if no highlighting, skip it
    bool queryHighlighting = st->m_queryHighlighting;
    if ( st->m_strip == 2 ) queryHighlighting = false;
    // do not do term highlighting if json
    if ( xd->m_contentType == CT_JSON ) queryHighlighting = false;
    SafeBuf tmp;
    SafeBuf *xb = sb;
    if ( format == FORMAT_XML  ) xb = &tmp;
    if ( format == FORMAT_JSON ) xb = &tmp;
    if ( ! queryHighlighting ) {
        xb->safeMemcpy ( content , contentLen );
        //p += contentLen ;
    }
    else {
        // get the content as xhtml (should be NULL terminated)
        //Words *ww = xd->getWords();
        if ( ! xml.set ( content , contentLen , false ,
                         0 , false , TITLEREC_CURRENT_VERSION ,
                         false , 0 , CT_HTML ) ) { // niceness is 0
            //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
            return sendErrorReply ( st , g_errno );
        }
        if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
            //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
            return sendErrorReply ( st , g_errno );
        }
        // sanity check
        //if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
        // how much space left in p?
        //avail = bufEnd - p;
        Matches m;
        m.setQuery ( &qq );
        m.addMatches ( &ww );
        hilen = hi.set ( xb , // p , avail ,
                         &ww , &m ,
                         false /*doStemming?*/ ,
                         st->m_clickAndScroll ,
                         thisUrl /*base url for click & scroll*/);
        //p += hilen;
        log(LOG_DEBUG, "query: Done highlighting cached page content");
    }
    if ( format == FORMAT_XML ) {
        sb->safePrintf("\t<content><![CDATA[");
        sb->cdataEncode ( xb->getBufStart() );
        sb->safePrintf("]]></content>\n");
        sb->safePrintf("</response>\n");
    }
    if ( format == FORMAT_JSON ) {
        // do not emit a raw newline inside the JSON string value
        sb->safePrintf("\t\"content\":\"");
        sb->jsonEncode ( xb->getBufStart() );
        sb->safePrintf("\"\n}\n}\n");
    }
    // if it is content-type text, add a </pre>
    if ( pre ) { // p + 6 < bufEnd && pre ) {
        sb->safeMemcpy ( "</pre>" , 6 );
        //p += 6;
    }
    // calculate bufLen
    //long bufLen = p - buf;
    long ct = xd->m_contentType;
    // now filter the entire buffer to escape out the xml tags
    // so it is displayed nice
    SafeBuf newbuf;
    if ( ct == CT_XML ) {
        // encode the xml tags into <tagname> sequences
        if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
                                         sb->getLength(),
                                         0)){// niceness=0
            //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
            return sendErrorReply ( st , g_errno );
        }
        // free out buffer that we alloc'd before returning since this
        // should have copied it into another buffer
        //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
        // reassign
        //buf    = newbuf.getBufStart();
        //bufLen = newbuf.length();
        sb->stealBuf ( &newbuf );
    }
    // now encapsulate it in html head/tail and send it off
    // sendErr:
    contentType = "text/html";
    if ( strip == 2 ) contentType = "text/xml";
    // xml is usually buggy and this throws browser off
    //if ( ctype == CT_XML ) contentType = "text/xml";
    if ( xd->m_contentType == CT_JSON ) contentType = "application/json";
    if ( format == FORMAT_XML  ) contentType = "text/xml";
    if ( format == FORMAT_JSON ) contentType = "application/json";
    // safebuf, sb, is a member of "st" so this should copy the buffer
    // when it constructs the http reply, and we gotta call delete(st)
    // AFTER this so sb is still valid.
    bool status = g_httpServer.sendDynamicPage (s,
                                                //buf,bufLen,
                                                sb->getBufStart(),
                                                sb->getLength(),
                                                -1,false,
                                                contentType,
                                                -1, NULL, "utf8" );
    // nuke state2
    mdelete ( st , sizeof(State2) , "PageGet1" );
    delete (st);
    // free out buffer that we alloc'd before returning since this
    // should have copied it into another buffer
    //if ( ct == CT_XML ) newbuf.purge();
    //else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
    // and convey the status
    return status;
}
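
/*
 * A minimal sketch (hypothetical types, not Gigablast's real API) of the
 * re-entrant callback convention processLoop() relies on: each accessor
 * either returns its result, returns a -1 sentinel meaning "I blocked and
 * will invoke your callback later, at which point call me again from the
 * top", or returns NULL with g_errno set on error. Because every step is
 * guarded this way, the whole function is safe to re-enter repeatedly.
 */
#include <cstdio>

static int g_errno = 0;

struct AsyncDoc {
    bool loaded;
    // hypothetical accessor following the convention: result, (char*)-1, or NULL
    char *getContent() {
        if (!loaded) return (char *)-1; // blocked; caller's callback fires later
        return (char *)"content";
    }
};

struct State { AsyncDoc doc; };

// returns false if blocked, true otherwise -- same contract as processLoop()
bool loopSketch(void *state) {
    State *st = (State *)state;
    char *c = st->doc.getContent();
    if (c == (char *)-1) return false;   // blocked: we will be re-entered
    if (!c) { std::printf("error %d\n", g_errno); return true; }
    std::printf("got: %s\n", c);         // have the data; finish the request
    return true;
}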
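
/*
 * A standalone sketch of the <title> scan processLoop() performs inline over
 * its SafeBuf: a case-insensitive search for "<title" then "</title", capped
 * at 500 bytes past the opening tag to keep things fast, yielding pointers
 * into the buffer rather than a copy. The function name is illustrative, and
 * strncasecmp() is POSIX.
 */
#include <strings.h>

// Returns true and sets [start,end) to the title text if found.
static bool findTitle(const char *buf, const char *bufEnd,
                      const char **start, const char **end) {
    for (const char *t = buf; t + 6 < bufEnd; t++) {
        if (t[0] != '<') continue;
        if (strncasecmp(t + 1, "title", 5) != 0) continue;
        const char *x = t + 6;              // skip past "<title"
        const char *max = x + 500;          // cap the scan to keep things fast
        while (x < bufEnd && x < max && *x != '>') x++; // skip attributes
        if (x >= bufEnd || x >= max) return false;
        x++;                                // step past '>'
        const char *e = x;
        while (e + 7 < bufEnd && e < max && strncasecmp(e, "</title", 7) != 0)
            e++;
        if (e >= max || e + 7 >= bufEnd) return false;
        *start = x;
        *end   = e;
        return true;
    }
    return false;
}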
int main() {
    cv::VideoCapture cap("../data/guitar_scene.mp4");
    if(!cap.isOpened()) {
        cerr << "Camera/Video was not opened\n";
        return 1;
    }
    //SystemAlgorithms algorithms(new SIFTDetector(500), new SIFTExtractor, new BruteForceMatcher(cv::NORM_L2), new LucasKanadeAlgorithm);
    SystemAlgorithms algorithms = SystemAlgorithms::Create(false, true);
    HybridTracker tracker(algorithms);
    Marker target = tracker.Registry(PreMarker("../data/guitar_object.jpg", nullptr));
    Size2i targetSize = target.GetSize();
    vector<cv::Point2f> targetCorner(4);
    targetCorner[0] = cv::Point2f(0.0, 0.0);
    targetCorner[3] = cv::Point2f(0.0, targetSize.height);
    targetCorner[2] = cv::Point2f(targetSize.width, targetSize.height);
    targetCorner[1] = cv::Point2f(targetSize.width, 0.0);
    uint numFrames = 0;
    double time = (double) cv::getTickCount();
    Frame frame;
    Matches matches;
    while(true) {
        cap >> frame.image;
        if(frame.image.empty() and numFrames == 0) continue;
        if(frame.image.empty()) break;
        numFrames++;
        tracker.Update(frame);
        matches = tracker.Find(target, frame);
        if(matches.size() > 20) target.SetLost(false);
        else                    target.SetLost(true);
        if(matches.size() >= 4) {
            vector<Point2f> corners;
            Mat homography = cv::findHomography(matches.targetPts(), matches.scenePts(), cv::RANSAC, 4);
            cv::perspectiveTransform(targetCorner, corners, homography);
            if(!corners.empty()) {
                cv::line(frame.image, corners[0], corners[1], cv::Scalar(0, 255, 0), 4);
                cv::line(frame.image, corners[1], corners[2], cv::Scalar(0, 255, 0), 4);
                cv::line(frame.image, corners[2], corners[3], cv::Scalar(0, 255, 0), 4);
                cv::line(frame.image, corners[3], corners[0], cv::Scalar(0, 255, 0), 4);
            }
        }
        for(auto p : matches.scenePts()) {
            cv::circle(frame.image, p, 3, cv::Scalar(0, 255, 0), 1);
        }
        cv::imshow("Tracker Test", frame.image);
        char key = cv::waitKey(1);
        if(key == 0x1B) // ESC
            break;
    }
    time = (double)(cv::getTickCount() - time) / cv::getTickFrequency();
    cout << (numFrames/time) << " fps\n";
    return 0;
}
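
/*
 * A condensed sketch of the pose-overlay step from the loop above: estimate a
 * homography from matched target/scene points with RANSAC, project the
 * target's corners into the scene, and draw the quad. In the program above
 * the point vectors come from Matches::targetPts()/scenePts(); here they are
 * plain parameters, and the function name is illustrative. The H.empty()
 * guard is added because cv::findHomography can fail to find a consistent
 * model, in which case cv::perspectiveTransform must not be called.
 */
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/calib3d.hpp>

void drawTrackedQuad(cv::Mat& image,
                     const std::vector<cv::Point2f>& targetPts,
                     const std::vector<cv::Point2f>& scenePts,
                     const std::vector<cv::Point2f>& targetCorners) {
    if (targetPts.size() < 4 || targetPts.size() != scenePts.size())
        return; // findHomography needs at least 4 correspondences
    cv::Mat H = cv::findHomography(targetPts, scenePts, cv::RANSAC, 4);
    if (H.empty())
        return; // RANSAC found no consistent model this frame
    std::vector<cv::Point2f> corners;
    cv::perspectiveTransform(targetCorners, corners, H);
    // connect consecutive corners, closing the quad back to corner 0
    for (size_t i = 0; i < corners.size(); ++i)
        cv::line(image, corners[i], corners[(i + 1) % corners.size()],
                 cv::Scalar(0, 255, 0), 4);
}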