loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220426 10:13:01.286526 7635 rpc_client.cpp:190] LoadServer 10.7.144.105 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220426 10:13:01.286517 7636 rpc_client.cpp:190] LoadServer 10.7.144.105 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220426 10:13:01.286160 7637 rpc_client.cpp:190] LoadServer 10.7.144.105 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 131072 batch_size_per_proc ............................. 16384 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. True deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. eager hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.5031045079231262, latency(ms): 102.5100069493055344 | 2022-04-26 10:13:23.189 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/26 10:13:23.370, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/26 10:13:23.374, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/26 10:13:23.376, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/26 10:13:23.378, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.378, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/26 10:13:23.383, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.385, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.385, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/26 10:13:23.387, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.387, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.386, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/26 10:13:23.389, Tesla V100-SXM2-32GB, 470.57.02, 49 %, 14 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/26 10:13:23.392, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.394, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.395, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.397, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.397, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.398, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.400, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/26 10:13:23.403, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.405, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.406, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.408, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.408, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.408, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.411, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 14 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.413, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.413, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/26 10:13:23.416, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.416, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.418, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.418, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.419, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.421, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.424, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.425, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/26 10:13:23.427, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.427, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.429, Tesla V100-SXM2-32GB, 470.57.02, 33 %, 13 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.430, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.430, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.434, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.437, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.437, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/26 10:13:23.439, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.440, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.442, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.442, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.443, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.445, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.448, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.448, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/26 10:13:23.450, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.451, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.453, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.454, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.456, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.459, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/26 10:13:23.462, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.464, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.467, Tesla V100-SXM2-32GB, 470.57.02, 13 %, 5 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/26 10:13:23.469, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/26 10:13:23.478, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/26 10:13:23.486, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB [rank:0] iter: 200/1100, loss: 0.4703223109245300, latency(ms): 43.5024400055408478 | 2022-04-26 10:13:27.539 [rank:0] iter: 300/1100, loss: 0.4609257578849792, latency(ms): 42.2127199172973633 | 2022-04-26 10:13:31.761 [rank:0] iter: 400/1100, loss: 0.4522405564785004, latency(ms): 41.3103706762194633 | 2022-04-26 10:13:35.892 [rank:0] iter: 500/1100, loss: 0.4504601955413818, latency(ms): 41.5960429236292839 | 2022-04-26 10:13:40.051 [rank:0] iter: 600/1100, loss: 0.4464605748653412, latency(ms): 42.5574875250458717 | 2022-04-26 10:13:44.307 [rank:0] iter: 700/1100, loss: 0.4401438236236572, latency(ms): 42.5118567049503326 | 2022-04-26 10:13:48.558 [rank:0] iter: 800/1100, loss: 0.4424374401569366, latency(ms): 42.1159565076231956 | 2022-04-26 10:13:52.770 [rank:0] iter: 900/1100, loss: 0.4326224029064178, latency(ms): 42.5397039577364922 | 2022-04-26 10:13:57.024 [rank:0] iter: 1000/1100, loss: 0.4313187003135681, latency(ms): 43.0840174853801727 | 2022-04-26 10:14:01.332 [rank:0] iter: 1100/1100, loss: 0.4328897893428802, latency(ms): 43.0635387822985649 | 2022-04-26 10:14:05.639