| 1 | /* |
| 2 | * rw_request_hash.c |
| 3 | * |
| 4 | * Copyright (C) 2016 Aerospike, Inc. |
| 5 | * |
| 6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
| 7 | * license agreements. |
| 8 | * |
| 9 | * This program is free software: you can redistribute it and/or modify it under |
| 10 | * the terms of the GNU Affero General Public License as published by the Free |
| 11 | * Software Foundation, either version 3 of the License, or (at your option) any |
| 12 | * later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
| 15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
| 17 | * details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU Affero General Public License |
| 20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
| 21 | */ |
| 22 | |
| 23 | //========================================================== |
| 24 | // Includes. |
| 25 | // |
| 26 | |
| 27 | #include "transaction/rw_request_hash.h" |
| 28 | |
| 29 | #include <stdbool.h> |
| 30 | #include <stddef.h> |
| 31 | #include <stdint.h> |
| 32 | #include <string.h> |
| 33 | #include <unistd.h> |
| 34 | |
| 35 | #include "citrusleaf/alloc.h" |
| 36 | #include "citrusleaf/cf_atomic.h" |
| 37 | #include "citrusleaf/cf_clock.h" |
| 38 | |
| 39 | #include "cf_mutex.h" |
| 40 | #include "cf_thread.h" |
| 41 | #include "fault.h" |
| 42 | #include "msg.h" |
| 43 | #include "node.h" |
| 44 | #include "rchash.h" |
| 45 | |
| 46 | #include "base/cfg.h" |
| 47 | #include "base/datamodel.h" |
| 48 | #include "base/proto.h" |
| 49 | #include "base/transaction.h" |
| 50 | #include "base/transaction_policy.h" |
| 51 | #include "fabric/fabric.h" |
| 52 | #include "transaction/duplicate_resolve.h" |
| 53 | #include "transaction/replica_ping.h" |
| 54 | #include "transaction/replica_write.h" |
| 55 | #include "transaction/rw_request.h" |
| 56 | #include "transaction/rw_utils.h" |
| 57 | |
| 58 | |
| 59 | //========================================================== |
| 60 | // Typedefs & constants. |
| 61 | // |
| 62 | |
| 63 | const msg_template rw_mt[] = { |
| 64 | { RW_FIELD_OP, M_FT_UINT32 }, |
| 65 | { RW_FIELD_RESULT, M_FT_UINT32 }, |
| 66 | { RW_FIELD_NAMESPACE, M_FT_BUF }, |
| 67 | { RW_FIELD_NS_ID, M_FT_UINT32 }, |
| 68 | { RW_FIELD_GENERATION, M_FT_UINT32 }, |
| 69 | { RW_FIELD_DIGEST, M_FT_BUF }, |
| 70 | { RW_FIELD_RECORD, M_FT_BUF }, |
| 71 | { RW_FIELD_UNUSED_7, M_FT_BUF }, |
| 72 | { RW_FIELD_CLUSTER_KEY, M_FT_UINT64 }, |
| 73 | { RW_FIELD_OLD_RECORD, M_FT_BUF }, |
| 74 | { RW_FIELD_TID, M_FT_UINT32 }, |
| 75 | { RW_FIELD_VOID_TIME, M_FT_UINT32 }, |
| 76 | { RW_FIELD_INFO, M_FT_UINT32 }, |
| 77 | { RW_FIELD_UNUSED_13, M_FT_BUF }, |
| 78 | { RW_FIELD_UNUSED_14, M_FT_BUF }, |
| 79 | { RW_FIELD_UNUSED_15, M_FT_UINT64 }, |
| 80 | { RW_FIELD_LAST_UPDATE_TIME, M_FT_UINT64 }, |
| 81 | { RW_FIELD_SET_NAME, M_FT_BUF }, |
| 82 | { RW_FIELD_KEY, M_FT_BUF }, |
| 83 | { RW_FIELD_REGIME, M_FT_UINT32 } |
| 84 | }; |
| 85 | |
| 86 | COMPILER_ASSERT(sizeof(rw_mt) / sizeof(msg_template) == NUM_RW_FIELDS); |
| 87 | |
| 88 | #define RW_MSG_SCRATCH_SIZE 192 |
| 89 | |
| 90 | |
| 91 | //========================================================== |
| 92 | // Globals. |
| 93 | // |
| 94 | |
| 95 | static cf_rchash* g_rw_request_hash = NULL; |
| 96 | |
| 97 | |
| 98 | //========================================================== |
| 99 | // Forward declarations. |
| 100 | // |
| 101 | |
| 102 | uint32_t rw_request_hash_fn(const void* value, uint32_t value_len); |
| 103 | transaction_status handle_hot_key(rw_request* rw0, as_transaction* tr); |
| 104 | |
| 105 | void* run_retransmit(void* arg); |
| 106 | int retransmit_reduce_fn(const void* key, uint32_t keylen, void* data, void* udata); |
| 107 | void update_retransmit_stats(const rw_request* rw); |
| 108 | |
| 109 | int rw_msg_cb(cf_node id, msg* m, void* udata); |
| 110 | |
| 111 | |
| 112 | //========================================================== |
| 113 | // Public API. |
| 114 | // |
| 115 | |
| 116 | void |
| 117 | as_rw_init() |
| 118 | { |
| 119 | g_rw_request_hash = cf_rchash_create(rw_request_hash_fn, |
| 120 | rw_request_hdestroy, sizeof(rw_request_hkey), 32 * 1024, |
| 121 | CF_RCHASH_MANY_LOCK); |
| 122 | |
| 123 | cf_thread_create_detached(run_retransmit, NULL); |
| 124 | |
| 125 | as_fabric_register_msg_fn(M_TYPE_RW, rw_mt, sizeof(rw_mt), |
| 126 | RW_MSG_SCRATCH_SIZE, rw_msg_cb, NULL); |
| 127 | } |
| 128 | |
| 129 | |
| 130 | uint32_t |
| 131 | rw_request_hash_count() |
| 132 | { |
| 133 | return cf_rchash_get_size(g_rw_request_hash); |
| 134 | } |
| 135 | |
| 136 | |
| 137 | transaction_status |
| 138 | rw_request_hash_insert(rw_request_hkey* hkey, rw_request* rw, |
| 139 | as_transaction* tr) |
| 140 | { |
| 141 | while (cf_rchash_put_unique(g_rw_request_hash, hkey, sizeof(*hkey), rw) != |
| 142 | CF_RCHASH_OK) { |
| 143 | // rw_request with this digest already in hash - get it. |
| 144 | |
| 145 | rw_request* rw0; |
| 146 | |
| 147 | if (cf_rchash_get(g_rw_request_hash, hkey, sizeof(*hkey), |
| 148 | (void**)&rw0) != CF_RCHASH_OK) { |
| 149 | // But now it's gone - try insertion again immediately. |
| 150 | continue; |
| 151 | } |
| 152 | // else - got it - handle "hot key" scenario. |
| 153 | |
| 154 | cf_mutex_lock(&rw0->lock); |
| 155 | |
| 156 | transaction_status status = handle_hot_key(rw0, tr); |
| 157 | |
| 158 | cf_mutex_unlock(&rw0->lock); |
| 159 | rw_request_release(rw0); |
| 160 | |
| 161 | return status; // rw_request was not inserted in the hash |
| 162 | } |
| 163 | |
| 164 | return TRANS_IN_PROGRESS; // rw_request was inserted in the hash |
| 165 | } |
| 166 | |
| 167 | |
| 168 | void |
| 169 | rw_request_hash_delete(rw_request_hkey* hkey, rw_request* rw) |
| 170 | { |
| 171 | cf_rchash_delete_object(g_rw_request_hash, hkey, sizeof(*hkey), rw); |
| 172 | } |
| 173 | |
| 174 | |
| 175 | rw_request* |
| 176 | rw_request_hash_get(rw_request_hkey* hkey) |
| 177 | { |
| 178 | rw_request* rw = NULL; |
| 179 | |
| 180 | cf_rchash_get(g_rw_request_hash, hkey, sizeof(*hkey), (void**)&rw); |
| 181 | |
| 182 | return rw; |
| 183 | } |
| 184 | |
| 185 | |
| 186 | // For debugging only. |
| 187 | void |
| 188 | rw_request_hash_dump() |
| 189 | { |
| 190 | cf_info(AS_RW, "rw_request_hash dump not yet implemented" ); |
| 191 | // TODO - implement something, or deprecate. |
| 192 | } |
| 193 | |
| 194 | |
| 195 | //========================================================== |
| 196 | // Local helpers - hash insertion. |
| 197 | // |
| 198 | |
| 199 | uint32_t |
| 200 | rw_request_hash_fn(const void* key, uint32_t key_size) |
| 201 | { |
| 202 | rw_request_hkey* hkey = (rw_request_hkey*)key; |
| 203 | |
| 204 | return *(uint32_t*)&hkey->keyd.digest[DIGEST_HASH_BASE_BYTE]; |
| 205 | } |
| 206 | |
| 207 | |
| 208 | transaction_status |
| 209 | handle_hot_key(rw_request* rw0, as_transaction* tr) |
| 210 | { |
| 211 | as_namespace* ns = tr->rsv.ns; |
| 212 | |
| 213 | if (tr->origin == FROM_RE_REPL) { |
| 214 | // Always put this transaction at the head of the original rw_request's |
| 215 | // queue - it will be retried (first) when the original is complete. |
| 216 | rw_request_wait_q_push_head(rw0, tr); |
| 217 | |
| 218 | return TRANS_WAITING; |
| 219 | } |
| 220 | else if (ns->transaction_pending_limit != 0 && |
| 221 | rw0->wait_queue_depth > ns->transaction_pending_limit) { |
| 222 | // If we're over the hot key pending limit, fail this transaction. |
| 223 | cf_detail_digest(AS_RW, &tr->keyd, "{%s} key busy " , ns->name); |
| 224 | |
| 225 | cf_atomic64_incr(&ns->n_fail_key_busy); |
| 226 | tr->result_code = AS_ERR_KEY_BUSY; |
| 227 | |
| 228 | return TRANS_DONE_ERROR; |
| 229 | } |
| 230 | else { |
| 231 | // Queue this transaction on the original rw_request - it will be |
| 232 | // retried when the original is complete. |
| 233 | rw_request_wait_q_push(rw0, tr); |
| 234 | |
| 235 | return TRANS_WAITING; |
| 236 | } |
| 237 | } |
| 238 | |
| 239 | |
| 240 | //========================================================== |
| 241 | // Local helpers - retransmit. |
| 242 | // |
| 243 | |
| 244 | void* |
| 245 | run_retransmit(void* arg) |
| 246 | { |
| 247 | while (true) { |
| 248 | usleep(130 * 1000); |
| 249 | |
| 250 | now_times now; |
| 251 | |
| 252 | now.now_ns = cf_getns(); |
| 253 | now.now_ms = now.now_ns / 1000000; |
| 254 | |
| 255 | cf_rchash_reduce(g_rw_request_hash, retransmit_reduce_fn, &now); |
| 256 | } |
| 257 | |
| 258 | return NULL; |
| 259 | } |
| 260 | |
| 261 | |
| 262 | int |
| 263 | retransmit_reduce_fn(const void* key, uint32_t keylen, void* data, void* udata) |
| 264 | { |
| 265 | rw_request* rw = data; |
| 266 | now_times* now = (now_times*)udata; |
| 267 | |
| 268 | if (! rw->is_set_up) { |
| 269 | return 0; |
| 270 | } |
| 271 | |
| 272 | if (now->now_ns > rw->end_time) { |
| 273 | cf_mutex_lock(&rw->lock); |
| 274 | |
| 275 | rw->timeout_cb(rw); |
| 276 | |
| 277 | cf_mutex_unlock(&rw->lock); |
| 278 | |
| 279 | return CF_RCHASH_REDUCE_DELETE; |
| 280 | } |
| 281 | |
| 282 | if (rw->xmit_ms < now->now_ms) { |
| 283 | cf_mutex_lock(&rw->lock); |
| 284 | |
| 285 | if (rw->from.any) { |
| 286 | rw->xmit_ms = now->now_ms + rw->retry_interval_ms; |
| 287 | rw->retry_interval_ms *= 2; |
| 288 | |
| 289 | send_rw_messages(rw); |
| 290 | update_retransmit_stats(rw); |
| 291 | } |
| 292 | // else - lost race against dup-res or repl-write callback. |
| 293 | |
| 294 | cf_mutex_unlock(&rw->lock); |
| 295 | } |
| 296 | |
| 297 | return 0; |
| 298 | } |
| 299 | |
| 300 | |
| 301 | void |
| 302 | update_retransmit_stats(const rw_request* rw) |
| 303 | { |
| 304 | as_namespace* ns = rw->rsv.ns; |
| 305 | as_msg* m = &rw->msgp->msg; |
| 306 | bool is_dup_res = rw->repl_write_cb == NULL; |
| 307 | |
| 308 | // Note - only one retransmit thread, so no need for atomic increments. |
| 309 | |
| 310 | switch (rw->origin) { |
| 311 | case FROM_PROXY: |
| 312 | if (rw_request_is_batch_sub(rw)) { |
| 313 | ns->n_retransmit_all_batch_sub_dup_res++; |
| 314 | break; |
| 315 | } |
| 316 | // No break. |
| 317 | case FROM_CLIENT: { |
| 318 | bool is_write = (m->info2 & AS_MSG_INFO2_WRITE) != 0; |
| 319 | bool is_delete = (m->info2 & AS_MSG_INFO2_DELETE) != 0; |
| 320 | bool is_udf = (rw->msg_fields & AS_MSG_FIELD_BIT_UDF_FILENAME) != 0; |
| 321 | |
| 322 | if (is_dup_res) { |
| 323 | if (is_write) { |
| 324 | if (is_delete) { |
| 325 | ns->n_retransmit_all_delete_dup_res++; |
| 326 | } |
| 327 | else if (is_udf) { |
| 328 | ns->n_retransmit_all_udf_dup_res++; |
| 329 | } |
| 330 | else { |
| 331 | ns->n_retransmit_all_write_dup_res++; |
| 332 | } |
| 333 | } |
| 334 | else { |
| 335 | ns->n_retransmit_all_read_dup_res++; |
| 336 | } |
| 337 | } |
| 338 | else { |
| 339 | cf_assert(is_write, AS_RW, "read doing replica write" ); |
| 340 | |
| 341 | if (is_delete) { |
| 342 | ns->n_retransmit_all_delete_repl_write++; |
| 343 | } |
| 344 | else if (is_udf) { |
| 345 | ns->n_retransmit_all_udf_repl_write++; |
| 346 | } |
| 347 | else { |
| 348 | ns->n_retransmit_all_write_repl_write++; |
| 349 | } |
| 350 | } |
| 351 | } |
| 352 | break; |
| 353 | case FROM_BATCH: |
| 354 | // For now batch sub transactions are read-only. |
| 355 | ns->n_retransmit_all_batch_sub_dup_res++; |
| 356 | break; |
| 357 | case FROM_IUDF: |
| 358 | if (is_dup_res) { |
| 359 | ns->n_retransmit_udf_sub_dup_res++; |
| 360 | } |
| 361 | else { |
| 362 | ns->n_retransmit_udf_sub_repl_write++; |
| 363 | } |
| 364 | break; |
| 365 | case FROM_IOPS: |
| 366 | if (is_dup_res) { |
| 367 | ns->n_retransmit_ops_sub_dup_res++; |
| 368 | } |
| 369 | else { |
| 370 | ns->n_retransmit_ops_sub_repl_write++; |
| 371 | } |
| 372 | break; |
| 373 | case FROM_RE_REPL: |
| 374 | // For now we don't report re-replication retransmit stats. |
| 375 | break; |
| 376 | default: |
| 377 | cf_crash(AS_RW, "unexpected transaction origin %u" , rw->origin); |
| 378 | break; |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | |
| 383 | //========================================================== |
| 384 | // Local helpers - handle RW fabric messages. |
| 385 | // |
| 386 | |
| 387 | int |
| 388 | rw_msg_cb(cf_node id, msg* m, void* udata) |
| 389 | { |
| 390 | uint32_t op; |
| 391 | |
| 392 | if (msg_get_uint32(m, RW_FIELD_OP, &op) != 0) { |
| 393 | cf_warning(AS_RW, "got rw msg without op field" ); |
| 394 | as_fabric_msg_put(m); |
| 395 | return 0; |
| 396 | } |
| 397 | |
| 398 | switch (op) { |
| 399 | //-------------------------------------------- |
| 400 | // Duplicate resolution: |
| 401 | // |
| 402 | case RW_OP_DUP: |
| 403 | dup_res_handle_request(id, m); |
| 404 | break; |
| 405 | case RW_OP_DUP_ACK: |
| 406 | dup_res_handle_ack(id, m); |
| 407 | break; |
| 408 | |
| 409 | //-------------------------------------------- |
| 410 | // Replica writes: |
| 411 | // |
| 412 | case RW_OP_REPL_WRITE: |
| 413 | repl_write_handle_op(id, m); |
| 414 | break; |
| 415 | case RW_OP_WRITE: |
| 416 | repl_write_handle_old_op(id, m); |
| 417 | break; |
| 418 | case RW_OP_WRITE_ACK: |
| 419 | repl_write_handle_ack(id, m); |
| 420 | break; |
| 421 | case RW_OP_REPL_CONFIRM: |
| 422 | repl_write_handle_confirmation(m); |
| 423 | break; |
| 424 | |
| 425 | //-------------------------------------------- |
| 426 | // Replica pings: |
| 427 | // |
| 428 | case RW_OP_REPL_PING: |
| 429 | repl_ping_handle_op(id, m); |
| 430 | break; |
| 431 | case RW_OP_REPL_PING_ACK: |
| 432 | repl_ping_handle_ack(id, m); |
| 433 | break; |
| 434 | |
| 435 | default: |
| 436 | cf_warning(AS_RW, "got rw msg with unrecognized op %u" , op); |
| 437 | as_fabric_msg_put(m); |
| 438 | break; |
| 439 | } |
| 440 | |
| 441 | return 0; |
| 442 | } |
| 443 | |