loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220422 10:34:16.130400 7626 rpc_client.cpp:190] LoadServer 10.7.0.203 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220422 10:34:16.130342 7625 rpc_client.cpp:190] LoadServer 10.7.0.203 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 131072 batch_size_per_proc ............................. 16384 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. True deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. eager hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.5025784373283386, latency(ms): 105.4967971891164780 | 2022-04-22 10:34:38.274 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.010, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.016, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.022, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.025, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.031, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.036, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.037, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.043, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.043, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.046, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.047, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.047, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/22 10:34:39.049, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.050, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.053, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.053, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/22 10:34:39.056, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.058, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.060, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.060, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.063, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.065, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.067, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/22 10:34:39.067, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.070, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/22 10:34:39.071, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/22 10:34:39.073, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.074, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.076, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.078, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.084, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.084, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/22 10:34:39.087, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.088, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.090, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.093, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.094, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.096, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/22 10:34:39.100, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/22 10:34:39.101, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.690, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.692, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.693, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.695, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.696, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.696, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.697, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.697, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/22 10:34:39.699, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.699, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.699, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/22 10:34:39.701, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/22 10:34:39.701, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/22 10:34:39.702, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/22 10:34:39.704, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.705, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/22 10:34:39.707, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.708, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/22 10:34:39.710, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.711, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/22 10:34:39.713, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/22 10:34:39.714, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/22 10:34:39.718, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/22 10:34:39.720, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB [rank:0] iter: 200/1100, loss: 0.4683345556259155, latency(ms): 54.0533579885959625 | 2022-04-22 10:34:43.680 [rank:0] iter: 300/1100, loss: 0.4603804051876068, latency(ms): 40.3721096366643906 | 2022-04-22 10:34:47.717 [rank:0] iter: 400/1100, loss: 0.4518880844116211, latency(ms): 39.4867357984185219 | 2022-04-22 10:34:51.665 [rank:0] iter: 500/1100, loss: 0.4502055346965790, latency(ms): 40.1143591850996017 | 2022-04-22 10:34:55.677 [rank:0] iter: 600/1100, loss: 0.4463137090206146, latency(ms): 42.0586733520030975 | 2022-04-22 10:34:59.882 [rank:0] iter: 700/1100, loss: 0.4400443732738495, latency(ms): 42.9905378818511963 | 2022-04-22 10:35:04.182 [rank:0] iter: 800/1100, loss: 0.4422062039375305, latency(ms): 43.2768443599343300 | 2022-04-22 10:35:08.509 [rank:0] iter: 900/1100, loss: 0.4324224889278412, latency(ms): 42.5715249404311180 | 2022-04-22 10:35:12.766 [rank:0] iter: 1000/1100, loss: 0.4311563968658447, latency(ms): 41.0006738826632500 | 2022-04-22 10:35:16.866 [rank:0] iter: 1100/1100, loss: 0.4330929815769196, latency(ms): 41.9159888476133347 | 2022-04-22 10:35:21.058