void CDDBSQLUpdater::import(const std::string& importfile, bool initial_import) { m_rep.clear(); Duration duration; // construct an untar object and tell it to use bz2 when the file has the // .bz2 suffix (it should always have..) UnTar tar(importfile, (importfile.rfind(".bz2") == importfile.length() - 4)); m_sql.exec("PRAGMA synchronous=OFF"); m_sql.exec("PRAGMA count_changes=OFF"); m_sql.exec("PRAGMA journal_mode=MEMORY"); m_sql.exec("PRAGMA temp_store=MEMORY"); m_sql.exec("BEGIN TRANSACTION"); if (initial_import) { m_sql.exec("DROP INDEX fuzzyid_id_idx"); } UnTar::buf_t data; // get file after file while (tar.entry(data, TarHeader::File, true) != TarHeader::Unknown) { if (m_rep.rct && m_rep.rct % 100000 == 0) { duration.lap(); std::cout << fmt::format("{0} - records read: {1}, rps: {2}", duration.to_string(Duration::Precision::Seconds), m_rep.rct, (100000*1000) / (duration.get_lap(Duration::Precision::Milliseconds))) << std::endl; } ++m_rep.rct; m_rep.bct += data.size(); // following here is handling of normal files // construct DiskRecord from the data DiskRecord rec(data); // check if the record contains plausible data if (!rec.valid()) { if (m_debug) { std::string exterr = rec.artist() + " / " + rec.title(); error("INVALID", exterr, data); } ++m_rep.frct; continue; } bool record_written = false; uint32_t cdid = check_title_hash(rec.normalized_hash()); if (!cdid) { // this is a new record, write it cdid = write_record(rec, false); record_written = true; } else { ++m_rep.dcrcct; if (m_debug) { // this CD CRC is already known. For debug purposes, let's store them // to find out if they are legitimately so, or CRC collisions // (investigations showed they are legitimate dupes, but with differing discids due to // slightly different track offsets..) std::string exterr = fmt::format("hash duplicate: {0}", rec.normalized_hash()); error("HASHDUP", exterr, data); } // on purpose, fall through to writing the discid links - // all needed data is valid: the cdid, and the rec.discid() is actually a new // valid discid for that already known cdid } // now write the discid link(s) { bool discid_valid = true; uint32_t ecd = check_discid(rec.discid()); if (ecd) { // trouble - the discid is already known // // check if it is some sort of a collision // a. hash collision, where different frame lengths yield the same hash value // b. "real world" collision, where different CDs yield the same frame lengths // (in this case we can not do a lot to resolve it automatically) // c. actually same discid pointing to the same CD, which simply means that we // had undetected dupes in the original CDDB database // // Case a. concerns about one record in 2800 with FNV hash computation on frames // (which is really good, the original discid algorithm has a collision of // one record in 3 (which renders it unusable if it were not checking for // the frame lengths after fetching all the duplicate CDID records). // Case b. concerns about one in 146 records, in which the user needs to pick // the right disc. // Case c. is the most frequent one (about one in 23 records). In most cases, // those are duplicates due to improved text content (like accents in titles, etc.) // // For case c. we should then check if the new version is preferrable over the existing // version (higher revision, or if equal revision higher entropy value), and update if it is. // for now - we check later down if we still want to write it discid_valid = false; // now check if this really is a collision, that is, the // track sequences of the existing cd are different DiskRecord existing_rec = read_record(ecd, rec.discid()); bool same_frames = existing_rec.seconds() == rec.seconds() && existing_rec.frames() == rec.frames(); // now check if this is actually the same CD (by comparing the disc artist and title) bool same_title = (DiskRecord::compare_normalized(existing_rec.artist() + existing_rec.title(), rec.artist() + rec.title()) >= 25 || DiskRecord::compare_normalized(existing_rec.artist(), rec.artist()) >= 25 || DiskRecord::compare_normalized(existing_rec.title(), rec.title()) >= 25); if (!same_frames) { // write the discid link to the CD. It is a collision, the user will have to pick the right choice. discid_valid = true; if (same_title) ++m_rep.realcddidcollct; else ++m_rep.realdidcollct; if (m_debug) { std::string exterr = fmt::format("discid {0}, cd {1}, {2} / {3} - {4} / {5}", rec.discid(), ecd, rec.artist(), rec.title(), existing_rec.artist(), existing_rec.title()); if (same_title) error("SAMECDDID", exterr, data); else error("SAMEDID", exterr, data); } } else { // same frames -> std::string add_reason; if (same_title) { bool update_with_this = false; ++m_rep.samecdframesct; add_reason += "_REQ"; // now compare entropy - higher entropy is an indicator for more information and more // accurate code points (think of accented chars vs. ASCII) if (rec.entropy() > existing_rec.entropy()) { ++m_rep.entropy_gt; add_reason += "_EGT"; // update the existing record with this one, and remove the record if we had written one update_with_this = true; } else if (rec.entropy() == existing_rec.entropy()) { // now check if the strings are EXACTLY the same if (rec.equal_strings(existing_rec)) { ++m_rep.duplicate; add_reason += "_DUP"; // skip this, and remove the record if we had written one (not very probable) } else if (rec.equal_lowercase_strings(existing_rec)) { ++m_rep.duplicate_lower; add_reason += "_DLP"; // check which of the strings contains more uppercase characters (which, if they // are not all uppercase, is normally an indication of a more accurate record) if (rec.charcount_upper() > existing_rec.charcount_upper()) { ++m_rep.upper_count_gt; // update the existing record with this one, and remove the record if we had written one update_with_this = true; } else { ++m_rep.upper_count_eqlt; } } else { // now check which one contains more characters (which we take as an indication // of more complete information) if (rec.charcount() > existing_rec.charcount()) { ++m_rep.overall_count_gt; // update the existing record with this one, and remove the record if we had written one update_with_this = true; } else { ++m_rep.overall_count_eqlt; } ++m_rep.entropy_eq; add_reason += "_EEQ"; } } else { ++m_rep.entropy_lt; add_reason += "_ELT"; // skip this, and remove the record if we had written one (not very probable) } if (record_written) delete_record(cdid, rec.normalized_hash()); if (update_with_this) update_record(ecd, rec); } else { ++m_rep.sameframesct; } if (m_debug) { std::string exterr = fmt::format("discid {0}, cd {1}, {2} / {3} - {4} / {5}", rec.discid(), ecd, rec.artist(), rec.title(), existing_rec.artist(), existing_rec.title()); if (same_title) error(std::string("SAMECDFRAMES") + add_reason, exterr, data); // these are duplicate CD titles (well, they vary slightly, but mean the same CD) else error("SAMEFRAMES", exterr, data); // these are really same frames, but not same CDs } } } if (discid_valid) { write_discid(rec.discid(), cdid); write_fuzzy_discid(rec.fuzzy_discid(), cdid); } } } duration.lap(); std::cout << fmt::format("{0} - records read: {1}, rps: {2}", duration.to_string(Duration::Precision::Seconds), m_rep.rct, (m_rep.rct*1000) / (duration.get(Duration::Precision::Milliseconds))) << std::endl; std::cout << m_rep.to_string(); if (initial_import) { Duration idxduration; m_sql.exec("CREATE INDEX fuzzyid_id_idx ON FUZZYID (fuzzyid)"); idxduration.lap(); std::cout << fmt::format("index creation took {0}", idxduration.to_string(Duration::Precision::Milliseconds)) << std::endl; } m_sql.exec("COMMIT TRANSACTION"); duration.lap(); std::cout << fmt::format("total time used: {0}", duration.to_string(Duration::Precision::Milliseconds)) << std::endl; }