/*
  fill in a default ltdb header for a record that does not carry one yet
*/
static void ltdb_initial_header(struct ctdb_db_context *ctdb_db,
				TDB_DATA key,
				struct ctdb_ltdb_header *header)
{
	/* begin with every field cleared */
	ZERO_STRUCTP(header);

	/* the lmaster of the key becomes the first dmaster, and the
	   same node is recorded as the last accessor */
	header->laccessor = header->dmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
}
/*
  fill in a default ltdb header for a record that does not carry one yet
*/
static void ltdb_initial_header(struct ctdb_db_context *ctdb_db,
				TDB_DATA key,
				struct ctdb_ltdb_header *header)
{
	/* begin with every field cleared */
	ZERO_STRUCTP(header);

	/* tag the header with the AUTOMATIC record flag */
	header->flags = CTDB_REC_FLAG_AUTOMATIC;

	/* the lmaster of the key becomes the first dmaster */
	header->dmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
}
/* called when a CTDB_REPLY_REDIRECT packet comes in This packet arrives when we have sent a CTDB_REQ_CALL request and the node that received it is not the dmaster for the given key. We are given a hint as to what node to try next. */ void ctdb_reply_redirect(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) { struct ctdb_reply_redirect *c = (struct ctdb_reply_redirect *)hdr; struct ctdb_call_state *state; state = idr_find_type(ctdb->idr, hdr->reqid, struct ctdb_call_state); if (state == NULL) return; talloc_steal(state, c); /* don't allow for too many redirects */ if (state->redirect_count++ == CTDB_MAX_REDIRECT) { c->dmaster = ctdb_lmaster(ctdb, &state->call.key); } /* send it off again */ state->node = ctdb->nodes[c->dmaster]; state->c->hdr.destnode = c->dmaster; ctdb_queue_packet(ctdb, &state->c->hdr); }
/*
  hand the dmaster role for a record over to another node

  The CTDB_REQ_DMASTER packet is always addressed to the lmaster of the
  key, so the lmaster always learns who the dmaster is. The lmaster is
  then expected to send a CTDB_REPLY_DMASTER to the new dmaster.

  ctdb_db - database the record lives in
  c       - the call request that triggered the hand-over; its source
            node becomes the new dmaster and its reqid/db_id are reused
  header  - local ltdb header of the record; header->dmaster is updated
  key     - record key, copied into the packet
  data    - record data, copied into the packet after the key
*/
static void ctdb_call_send_dmaster(struct ctdb_db_context *ctdb_db,
				   struct ctdb_req_call *c,
				   struct ctdb_ltdb_header *header,
				   TDB_DATA *key, TDB_DATA *data)
{
	struct ctdb_context *ctdb = ctdb_db->ctdb;
	struct ctdb_req_dmaster *r;
	int len;

	/* fixed part of the packet followed by key and data back to back */
	len = offsetof(struct ctdb_req_dmaster, data) + key->dsize + data->dsize;

	r = ctdb->methods->allocate_pkt(ctdb, len);
	CTDB_NO_MEMORY_FATAL(ctdb, r);
	talloc_set_name_const(r, "send_dmaster packet");

	/* generic request header */
	r->hdr.ctdb_magic   = CTDB_MAGIC;
	r->hdr.ctdb_version = CTDB_VERSION;
	r->hdr.operation    = CTDB_REQ_DMASTER;
	r->hdr.length       = len;
	r->hdr.srcnode      = ctdb->vnn;
	r->hdr.destnode     = ctdb_lmaster(ctdb, key);
	r->hdr.reqid        = c->hdr.reqid;

	/* dmaster request specific fields */
	r->db_id   = c->db_id;
	r->dmaster = c->hdr.srcnode;
	r->keylen  = key->dsize;
	r->datalen = data->dsize;

	/* payload: key first, data directly after it */
	memcpy(r->data, key->dptr, key->dsize);
	memcpy(r->data + key->dsize, data->dptr, data->dsize);

	/* XXX - probably not necessary when lmaster==dmaster

	   record in the local ltdb that the packet's destination node
	   is now the dmaster */
	/* NOTE(review): the result of ctdb_ltdb_store() is not checked */
	header->dmaster = r->hdr.destnode;
	ctdb_ltdb_store(ctdb_db, *key, header, *data);

	ctdb_queue_packet(ctdb, &r->hdr);

	talloc_free(r);
}
/* called when a CTDB_REQ_DMASTER packet comes in this comes into the lmaster for a record when the current dmaster wants to give up the dmaster role and give it to someone else */ void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) { struct ctdb_req_dmaster *c = (struct ctdb_req_dmaster *)hdr; struct ctdb_reply_dmaster *r; TDB_DATA key, data, data2; struct ctdb_ltdb_header header; struct ctdb_db_context *ctdb_db; int ret, len; TALLOC_CTX *tmp_ctx; key.dptr = c->data; key.dsize = c->keylen; data.dptr = c->data + c->keylen; data.dsize = c->datalen; ctdb_db = find_ctdb_db(ctdb, c->db_id); if (!ctdb_db) { ctdb_send_error(ctdb, hdr, -1, "Unknown database in request. db_id==0x%08x", c->db_id); return; } /* fetch the current record */ ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, hdr, &data2, ctdb_recv_raw_pkt, ctdb); if (ret == -1) { ctdb_fatal(ctdb, "ctdb_req_dmaster failed to fetch record"); return; } if (ret == -2) { DEBUG(2,(__location__ " deferring ctdb_request_dmaster\n")); return; } /* its a protocol error if the sending node is not the current dmaster */ if (header.dmaster != hdr->srcnode && hdr->srcnode != ctdb_lmaster(ctdb_db->ctdb, &key)) { ctdb_fatal(ctdb, "dmaster request from non-master"); return; } header.dmaster = c->dmaster; ret = ctdb_ltdb_store(ctdb_db, key, &header, data); ctdb_ltdb_unlock(ctdb_db, key); if (ret != 0) { ctdb_fatal(ctdb, "ctdb_req_dmaster unable to update dmaster"); return; } /* put the packet on a temporary context, allowing us to safely free it below even if ctdb_reply_dmaster() has freed it already */ tmp_ctx = talloc_new(ctdb); /* send the CTDB_REPLY_DMASTER */ len = offsetof(struct ctdb_reply_dmaster, data) + data.dsize; r = ctdb->methods->allocate_pkt(tmp_ctx, len); CTDB_NO_MEMORY_FATAL(ctdb, r); talloc_set_name_const(r, "reply_dmaster packet"); r->hdr.length = len; r->hdr.ctdb_magic = CTDB_MAGIC; r->hdr.ctdb_version = CTDB_VERSION; r->hdr.operation = CTDB_REPLY_DMASTER; r->hdr.destnode 
= c->dmaster; r->hdr.srcnode = ctdb->vnn; r->hdr.reqid = hdr->reqid; r->datalen = data.dsize; memcpy(&r->data[0], data.dptr, data.dsize); ctdb_queue_packet(ctdb, &r->hdr); talloc_free(tmp_ctx); }
/**
 * write a record to a normal database
 *
 * This is the server-variant of the ctdb_ltdb_store function.
 * It contains logic to determine whether a record should be
 * stored or deleted. It also sends SCHEDULE_FOR_DELETION
 * controls to the local ctdb daemon if appropriate.
 *
 * ctdb_db - database to write to
 * key     - record key
 * header  - ltdb header to store in front of the data. It is modified:
 *           the VACUUM_MIGRATED and AUTOMATIC flags are always cleared
 *           and the rsn may be incremented.
 * data    - record payload (may be empty)
 *
 * Returns the result of tdb_store()/tdb_delete(); non-zero is treated
 * as failure. Allocation failure of the temporary record buffer is
 * handled by CTDB_NO_MEMORY().
 */
static int ctdb_ltdb_store_server(struct ctdb_db_context *ctdb_db,
				  TDB_DATA key,
				  struct ctdb_ltdb_header *header,
				  TDB_DATA data)
{
	struct ctdb_context *ctdb = ctdb_db->ctdb;
	TDB_DATA rec;
	int ret;
	bool seqnum_suppressed = false;
	bool keep = false;
	bool schedule_for_deletion = false;
	bool remove_from_delete_queue = false;
	uint32_t lmaster;

	if (ctdb->flags & CTDB_FLAG_TORTURE) {
		struct ctdb_ltdb_header *h2;

		/*
		 * Torture mode sanity check: complain if the stored
		 * record has a higher rsn than the one being written.
		 * The stored record must be at least as large as a
		 * full header (not merely as large as a pointer)
		 * before it may be interpreted as one.
		 */
		rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
		h2 = (struct ctdb_ltdb_header *)rec.dptr;
		if (rec.dptr != NULL &&
		    rec.dsize >= sizeof(*h2) &&
		    h2->rsn > header->rsn) {
			DEBUG(DEBUG_CRIT,("RSN regression! %llu %llu\n",
					  (unsigned long long)h2->rsn,
					  (unsigned long long)header->rsn));
		}
		free(rec.dptr);
	}

	if (ctdb->vnn_map == NULL) {
		/*
		 * Called from a client: always store the record
		 * Also don't call ctdb_lmaster since it uses the vnn_map!
		 */
		keep = true;
		goto store;
	}

	lmaster = ctdb_lmaster(ctdb_db->ctdb, &key);

	/*
	 * If we migrate an empty record off to another node
	 * and the record has not been migrated with data,
	 * delete the record instead of storing the empty record.
	 */
	if (data.dsize != 0) {
		keep = true;
	} else if (header->flags & CTDB_REC_RO_FLAGS) {
		keep = true;
	} else if (ctdb_db->persistent) {
		keep = true;
	} else if (header->flags & CTDB_REC_FLAG_AUTOMATIC) {
		/*
		 * The record is not created by the client but
		 * automatically by the ctdb_ltdb_fetch logic that
		 * creates a record with an initial header in the
		 * ltdb before trying to migrate the record from
		 * the current lmaster. Keep it instead of trying
		 * to delete the non-existing record...
		 */
		keep = true;
		schedule_for_deletion = true;
	} else if (header->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
		keep = true;
	} else if (ctdb_db->ctdb->pnn == lmaster) {
		/*
		 * If we are lmaster, then we usually keep the record.
		 * But if we retrieve the dmaster role by a VACUUM_MIGRATE
		 * and the record is empty and has never been migrated
		 * with data, then we should delete it instead of storing it.
		 * This is part of the vacuuming process.
		 *
		 * The reason that we usually need to store even empty records
		 * on the lmaster is that a client operating directly on the
		 * lmaster (== dmaster) expects the local copy of the record to
		 * exist after successful ctdb migrate call. If the record does
		 * not exist, the client goes into a migrate loop and eventually
		 * fails. So storing the empty record makes sure that we do not
		 * need to change the client code.
		 */
		if (!(header->flags & CTDB_REC_FLAG_VACUUM_MIGRATED)) {
			keep = true;
		} else if (ctdb_db->ctdb->pnn != header->dmaster) {
			keep = true;
		}
	} else if (ctdb_db->ctdb->pnn == header->dmaster) {
		keep = true;
	}

	if (keep) {
		/*
		 * On a non-persistent database where this node is the
		 * dmaster and the record carries no read-only flags,
		 * bump the rsn; an empty record in that situation is
		 * additionally scheduled for deletion.
		 */
		if (!ctdb_db->persistent &&
		    (ctdb_db->ctdb->pnn == header->dmaster) &&
		    !(header->flags & CTDB_REC_RO_FLAGS))
		{
			header->rsn++;
			if (data.dsize == 0) {
				schedule_for_deletion = true;
			}
		}
		/* a kept record that is not scheduled for deletion must
		   not linger in the delete queue */
		remove_from_delete_queue = !schedule_for_deletion;
	}

store:
	/*
	 * The VACUUM_MIGRATED flag is only set temporarily for
	 * the above logic when the record was retrieved by a
	 * VACUUM_MIGRATE call and should not be stored in the
	 * database.
	 *
	 * The VACUUM_MIGRATE call is triggered by a vacuum fetch,
	 * and there are two cases in which the corresponding record
	 * is stored in the local database:
	 * 1. The record has been migrated with data in the past
	 *    (the MIGRATED_WITH_DATA record flag is set).
	 * 2. The record has been filled with data again since it
	 *    had been submitted in the VACUUM_FETCH message to the
	 *    lmaster.
	 * For such records it is important to not store the
	 * VACUUM_MIGRATED flag in the database.
	 */
	header->flags &= ~CTDB_REC_FLAG_VACUUM_MIGRATED;

	/*
	 * Similarly, clear the AUTOMATIC flag which should not enter
	 * the local database copy since this would require client
	 * modifications to clear the flag when the client stores
	 * the record.
	 */
	header->flags &= ~CTDB_REC_FLAG_AUTOMATIC;

	/* assemble header + data in one temporary buffer */
	rec.dsize = sizeof(*header) + data.dsize;
	rec.dptr = talloc_size(ctdb, rec.dsize);
	CTDB_NO_MEMORY(ctdb, rec.dptr);

	memcpy(rec.dptr, header, sizeof(*header));
	if (data.dsize > 0) {
		/* skip the copy for empty payloads so that memcpy is
		   never handed a possibly-NULL source pointer */
		memcpy(rec.dptr + sizeof(*header), data.dptr, data.dsize);
	}

	/* Databases with seqnum updates enabled only get their seqnum
	   changes when/if we modify the data */
	if (ctdb_db->seqnum_update != NULL) {
		TDB_DATA old;

		old = tdb_fetch(ctdb_db->ltdb->tdb, key);

		/* equal total size implies both records hold at least a
		   header, so only the payload after it is compared */
		if ((old.dsize == rec.dsize) &&
		    !memcmp(old.dptr + sizeof(struct ctdb_ltdb_header),
			    rec.dptr + sizeof(struct ctdb_ltdb_header),
			    rec.dsize - sizeof(struct ctdb_ltdb_header))) {
			tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_SEQNUM);
			seqnum_suppressed = true;
		}
		free(old.dptr);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " db[%s]: %s record: hash[0x%08x]\n",
			    ctdb_db->db_name,
			    keep?"storing":"deleting",
			    ctdb_hash(&key)));

	if (keep) {
		ret = tdb_store(ctdb_db->ltdb->tdb, key, rec, TDB_REPLACE);
	} else {
		ret = tdb_delete(ctdb_db->ltdb->tdb, key);
	}

	if (ret != 0) {
		int lvl = DEBUG_ERR;

		/* deleting a record that does not exist is only worth
		   a debug-level message */
		if (!keep &&
		    tdb_error(ctdb_db->ltdb->tdb) == TDB_ERR_NOEXIST)
		{
			lvl = DEBUG_DEBUG;
		}

		DEBUG(lvl, (__location__ " db[%s]: Failed to %s record: "
			    "%d - %s\n",
			    ctdb_db->db_name,
			    keep?"store":"delete", ret,
			    tdb_errorstr(ctdb_db->ltdb->tdb)));

		/* the write failed, so leave the delete queue alone */
		schedule_for_deletion = false;
		remove_from_delete_queue = false;
	}

	/* restore seqnum updates if they were suppressed above */
	if (seqnum_suppressed) {
		tdb_add_flags(ctdb_db->ltdb->tdb, TDB_SEQNUM);
	}

	talloc_free(rec.dptr);

	if (schedule_for_deletion) {
		int ret2;

		/* failure here is logged but does not change the
		   return value of the store */
		ret2 = ctdb_local_schedule_for_deletion(ctdb_db, header, key);
		if (ret2 != 0) {
			DEBUG(DEBUG_ERR, (__location__ " ctdb_local_schedule_for_deletion failed.\n"));
		}
	}

	if (remove_from_delete_queue) {
		ctdb_local_remove_from_delete_queue(ctdb_db, header, key);
	}

	return ret;
}
/* main program */ int main(int argc, const char *argv[]) { struct ctdb_context *ctdb; TDB_DATA key; struct poptOption popt_options[] = { POPT_AUTOHELP POPT_CTDB_CMDLINE { "record", 'r', POPT_ARG_STRING, &TESTKEY, 0, "record", "string" }, POPT_TABLEEND }; int opt, ret; const char **extra_argv; int extra_argc = 0; poptContext pc; struct event_context *ev; pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST); while ((opt = poptGetNextOpt(pc)) != -1) { switch (opt) { default: fprintf(stderr, "Invalid option %s: %s\n", poptBadOption(pc, 0), poptStrerror(opt)); exit(1); } } /* setup the remaining options for the main program to use */ extra_argv = poptGetArgs(pc); if (extra_argv) { extra_argv++; while (extra_argv[extra_argc]) extra_argc++; } ev = event_context_init(NULL); ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(5, 0)); if (ctdb == NULL) { exit(1); } key.dptr = discard_const(TESTKEY); key.dsize = strlen(TESTKEY); ret = ctdb_ctrl_getvnnmap(ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map); if (ret != 0) { printf("failed to get vnnmap\n"); exit(10); } printf("Record:%s\n", TESTKEY); printf("Lmaster : %d\n", ctdb_lmaster(ctdb, &key)); /* attach to a specific database */ ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(5, 0), "test.tdb", false, 0); if (!ctdb_db) { printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb)); exit(1); } printf("Waiting for cluster\n"); while (1) { uint32_t recmode=1; ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode); if (recmode == 0) break; event_loop_once(ev); } while (1) { fetch_lock_once(ctdb, ev); } return 0; }