コード例 #1
0
void CDDBSQLUpdater::import(const std::string& importfile, bool initial_import)
{
    m_rep.clear();
    
    Duration duration;

    // construct an untar object and tell it to use bz2 when the file has the
    // .bz2 suffix (it should always have..)
    UnTar tar(importfile, (importfile.rfind(".bz2") == importfile.length() - 4));

    m_sql.exec("PRAGMA synchronous=OFF");
    m_sql.exec("PRAGMA count_changes=OFF");
    m_sql.exec("PRAGMA journal_mode=MEMORY");
    m_sql.exec("PRAGMA temp_store=MEMORY");

    m_sql.exec("BEGIN TRANSACTION");

    if (initial_import) {
        m_sql.exec("DROP INDEX fuzzyid_id_idx");
    }

    UnTar::buf_t data;

    // get file after file
    while (tar.entry(data, TarHeader::File, true) != TarHeader::Unknown) {

        if (m_rep.rct && m_rep.rct % 100000 == 0) {
            duration.lap();
            std::cout << fmt::format("{0} - records read: {1}, rps: {2}",
                                     duration.to_string(Duration::Precision::Seconds),
                                     m_rep.rct,
                                     (100000*1000) / (duration.get_lap(Duration::Precision::Milliseconds)))
            << std::endl;
        }

        ++m_rep.rct;
        m_rep.bct += data.size();

        // following here is handling of normal files

        // construct DiskRecord from the data
        DiskRecord rec(data);

        // check if the record contains plausible data
        if (!rec.valid()) {

            if (m_debug) {
                std::string exterr = rec.artist() + " / " + rec.title();
                error("INVALID", exterr, data);
            }

            ++m_rep.frct;
            continue;
        }

        bool record_written = false;

        uint32_t cdid = check_title_hash(rec.normalized_hash());

        if (!cdid) {

            // this is a new record, write it
            cdid = write_record(rec, false);
            record_written = true;

        } else {

            ++m_rep.dcrcct;

            if (m_debug) {
                // this CD CRC is already known. For debug purposes, let's store them
                // to find out if they are legitimately so, or CRC collisions
                // (investigations showed they are legitimate dupes, but with differing discids due to
                // slightly different track offsets..)
                std::string exterr = fmt::format("hash duplicate: {0}", rec.normalized_hash());
                error("HASHDUP", exterr, data);
            }

            // on purpose, fall through to writing the discid links -
            // all needed data is valid: the cdid, and the rec.discid() is actually a new
            // valid discid for that already known cdid
            
        }
        
        // now write the discid link(s)
        {

            bool discid_valid = true;

            uint32_t ecd = check_discid(rec.discid());

            if (ecd) {

                // trouble - the discid is already known
                //
                // check if it is some sort of a collision
                //  a. hash collision, where different frame lengths yield the same hash value
                //  b. "real world" collision, where different CDs yield the same frame lengths
                //      (in this case we can not do a lot to resolve it automatically)
                //  c. actually same discid pointing to the same CD, which simply means that we
                //      had undetected dupes in the original CDDB database
                //
                // Case a. concerns about one record in 2800 with FNV hash computation on frames
                //      (which is really good, the original discid algorithm has a collision of
                //      one record in 3 (which renders it unusable if it were not checking for
                //      the frame lengths after fetching all the duplicate CDID records).
                // Case b. concerns about one in 146 records, in which the user needs to pick
                //      the right disc.
                // Case c. is the most frequent one (about one in 23 records). In most cases,
                // those are duplicates due to improved text content (like accents in titles, etc.)
                //
                // For case c. we should then check if the new version is preferrable over the existing
                // version (higher revision, or if equal revision higher entropy value), and update if it is.

                // for now - we check later down if we still want to write it
                discid_valid  = false;

                // now check if this really is a collision, that is, the
                // track sequences of the existing cd are different

                DiskRecord existing_rec = read_record(ecd, rec.discid());

                bool same_frames = existing_rec.seconds() == rec.seconds() && existing_rec.frames() == rec.frames();

                // now check if this is actually the same CD (by comparing the disc artist and title)
                bool same_title = (DiskRecord::compare_normalized(existing_rec.artist() + existing_rec.title(), rec.artist() + rec.title()) >= 25
                             || DiskRecord::compare_normalized(existing_rec.artist(), rec.artist()) >= 25
                             || DiskRecord::compare_normalized(existing_rec.title(), rec.title()) >= 25);

                if (!same_frames) {

                    // write the discid link to the CD. It is a collision, the user will have to pick the right choice.
                    discid_valid = true;

                    if (same_title) ++m_rep.realcddidcollct;
                    else ++m_rep.realdidcollct;

                    if (m_debug) {
                        std::string exterr = fmt::format("discid {0}, cd {1}, {2} / {3} - {4} / {5}",
                                                         rec.discid(), ecd, rec.artist(), rec.title(),
                                                         existing_rec.artist(), existing_rec.title());
                        if (same_title) error("SAMECDDID", exterr, data);
                        else error("SAMEDID", exterr, data);
                    }

                } else {

                    // same frames ->

                    std::string add_reason;

                    if (same_title) {

                        bool update_with_this = false;

                        ++m_rep.samecdframesct;
                        add_reason += "_REQ";

                        // now compare entropy - higher entropy is an indicator for more information and more
                        // accurate code points (think of accented chars vs. ASCII)
                        
                        if (rec.entropy() > existing_rec.entropy()) {

                            ++m_rep.entropy_gt;
                            add_reason += "_EGT";
                            // update the existing record with this one, and remove the record if we had written one
                            update_with_this = true;

                        }
                        else if (rec.entropy() == existing_rec.entropy()) {

                            // now check if the strings are EXACTLY the same
                            if (rec.equal_strings(existing_rec)) {

                                ++m_rep.duplicate;
                                add_reason += "_DUP";
                                // skip this, and remove the record if we had written one (not very probable)

                            } else if (rec.equal_lowercase_strings(existing_rec)) {

                                ++m_rep.duplicate_lower;
                                add_reason += "_DLP";

                                // check which of the strings contains more uppercase characters (which, if they
                                // are not all uppercase, is normally an indication of a more accurate record)

                                if (rec.charcount_upper() > existing_rec.charcount_upper()) {
                                    ++m_rep.upper_count_gt;
                                    // update the existing record with this one, and remove the record if we had written one
                                    update_with_this = true;
                                } else {
                                    ++m_rep.upper_count_eqlt;
                                }

                            } else {

                                // now check which one contains more characters (which we take as an indication
                                // of more complete information)
                                
                                if (rec.charcount() > existing_rec.charcount()) {
                                    ++m_rep.overall_count_gt;
                                    // update the existing record with this one, and remove the record if we had written one
                                    update_with_this = true;
                                } else {
                                    ++m_rep.overall_count_eqlt;
                                }

                                ++m_rep.entropy_eq;
                                add_reason += "_EEQ";

                            }

                        }
                        else {

                            ++m_rep.entropy_lt;
                            add_reason += "_ELT";
                            // skip this, and remove the record if we had written one (not very probable)

                        }

                        if (record_written) delete_record(cdid, rec.normalized_hash());
                        if (update_with_this) update_record(ecd, rec);

                    } else {
                        ++m_rep.sameframesct;
                    }

                    if (m_debug) {
                        std::string exterr = fmt::format("discid {0}, cd {1}, {2} / {3} - {4} / {5}",
                                                         rec.discid(), ecd, rec.artist(), rec.title(),
                                                         existing_rec.artist(), existing_rec.title());
                        if (same_title) error(std::string("SAMECDFRAMES") + add_reason, exterr, data); // these are duplicate CD titles (well, they vary slightly, but mean the same CD)
                        else error("SAMEFRAMES", exterr, data); // these are really same frames, but not same CDs
                    }
                    
                }

            }

            if (discid_valid) {
                write_discid(rec.discid(), cdid);
                write_fuzzy_discid(rec.fuzzy_discid(), cdid);
            }
        }
    }

    duration.lap();
    std::cout << fmt::format("{0} - records read: {1}, rps: {2}",
                             duration.to_string(Duration::Precision::Seconds),
                             m_rep.rct,
                             (m_rep.rct*1000) / (duration.get(Duration::Precision::Milliseconds)))
        << std::endl;

    std::cout << m_rep.to_string();

    if (initial_import) {
        Duration idxduration;
        m_sql.exec("CREATE INDEX fuzzyid_id_idx ON FUZZYID (fuzzyid)");
        idxduration.lap();
        std::cout << fmt::format("index creation took {0}", idxduration.to_string(Duration::Precision::Milliseconds)) << std::endl;
    }

    m_sql.exec("COMMIT TRANSACTION");

    duration.lap();

    std::cout << fmt::format("total time used: {0}", duration.to_string(Duration::Precision::Milliseconds)) << std::endl;
}