loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220409 02:14:57.728325 7637 rpc_client.cpp:190] LoadServer 10.7.102.248 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220409 02:14:57.732436 7638 rpc_client.cpp:190] LoadServer 10.7.102.248 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220409 02:14:57.735468 7636 rpc_client.cpp:190] LoadServer 10.7.102.248 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220409 02:14:57.730289 7642 rpc_client.cpp:190] LoadServer 10.7.102.248 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220409 02:14:57.735158 7639 rpc_client.cpp:190] LoadServer 10.7.102.248 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 131072 batch_size_per_proc ............................. 16384 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. True deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. eager hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.5036971569061279, latency(ms): 120.2594156563282013 | 2022-04-09 02:15:21.352 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:15:21.485, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/09 02:15:21.491, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:15:21.492, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:15:21.495, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/09 02:15:21.496, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.496, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:15:21.499, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/09 02:15:21.500, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/09 02:15:21.501, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.502, Tesla V100-SXM2-32GB, 470.57.02, 75 %, 21 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.503, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/09 02:15:21.507, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/09 02:15:21.506, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/09 02:15:21.508, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/09 02:15:21.509, Tesla V100-SXM2-32GB, 470.57.02, 75 %, 21 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.510, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.510, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.515, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.515, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:15:21.517, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.517, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:15:21.518, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 18 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.518, Tesla V100-SXM2-32GB, 470.57.02, 75 %, 21 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.522, Tesla V100-SXM2-32GB, 470.57.02, 75 %, 21 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.522, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.522, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/09 02:15:21.523, Tesla V100-SXM2-32GB, 470.57.02, 75 %, 21 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.524, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 18 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.524, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29888 MiB, 2622 MiB 2022/04/09 02:15:21.525, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.525, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.530, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.530, Tesla V100-SXM2-32GB, 470.57.02, 75 %, 21 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.532, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/09 02:15:21.532, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.532, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.533, Tesla V100-SXM2-32GB, 470.57.02, 46 %, 8 %, 32510 MiB, 29908 MiB, 2602 MiB 2022/04/09 02:15:21.533, Tesla V100-SXM2-32GB, 470.57.02, 71 %, 21 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.534, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 18 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.538, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 18 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.539, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.540, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.540, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 18 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.541, Tesla V100-SXM2-32GB, 470.57.02, 71 %, 21 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.542, Tesla V100-SXM2-32GB, 470.57.02, 19 %, 11 %, 32510 MiB, 29984 MiB, 2526 MiB 2022/04/09 02:15:21.542, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.547, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.548, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 18 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.549, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.549, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.550, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29960 MiB, 2550 MiB 2022/04/09 02:15:21.551, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.555, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.556, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.557, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.557, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.558, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 13 %, 32510 MiB, 29968 MiB, 2542 MiB 2022/04/09 02:15:21.564, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.565, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.566, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29900 MiB, 2610 MiB 2022/04/09 02:15:21.575, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.576, Tesla V100-SXM2-32GB, 470.57.02, 34 %, 8 %, 32510 MiB, 29844 MiB, 2666 MiB 2022/04/09 02:15:21.584, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB 2022/04/09 02:15:21.585, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29924 MiB, 2586 MiB [rank:0] iter: 200/1100, loss: 0.4701324701309204, latency(ms): 42.8989387676119804 | 2022-04-09 02:15:25.642 [rank:0] iter: 300/1100, loss: 0.4608173370361328, latency(ms): 40.2331069484353065 | 2022-04-09 02:15:29.665 [rank:0] iter: 400/1100, loss: 0.4522465169429779, latency(ms): 41.4584445208311081 | 2022-04-09 02:15:33.811 [rank:0] iter: 500/1100, loss: 0.4505920410156250, latency(ms): 41.6010596975684166 | 2022-04-09 02:15:37.971 [rank:0] iter: 600/1100, loss: 0.4465112686157227, latency(ms): 42.2634643316268921 | 2022-04-09 02:15:42.197 [rank:0] iter: 700/1100, loss: 0.4401841461658478, latency(ms): 42.7751135453581810 | 2022-04-09 02:15:46.475 [rank:0] iter: 800/1100, loss: 0.4423094689846039, latency(ms): 41.5613636747002602 | 2022-04-09 02:15:50.631 [rank:0] iter: 900/1100, loss: 0.4328640103340149, latency(ms): 43.1671634316444397 | 2022-04-09 02:15:54.948 [rank:0] iter: 1000/1100, loss: 0.4314532876014709, latency(ms): 42.9910354688763618 | 2022-04-09 02:15:59.247 [rank:0] iter: 1100/1100, loss: 0.4328012764453888, latency(ms): 43.4901724383234978 | 2022-04-09 02:16:03.596