loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220413 02:06:46.343760 5240 rpc_client.cpp:190] LoadServer 10.7.0.64 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220413 02:06:46.343725 5238 rpc_client.cpp:190] LoadServer 10.7.0.64 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220413 02:06:46.342818 5239 rpc_client.cpp:190] LoadServer 10.7.0.64 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 512 batch_size_per_proc ............................. 64 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. True deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. eager hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.5354669094085693, latency(ms): 75.6573206931352615 | 2022-04-13 02:07:05.282 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/13 02:07:05.473, Tesla V100-SXM2-32GB, 470.57.02, 66 %, 58 %, 32510 MiB, 30376 MiB, 2134 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/13 02:07:05.480, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/13 02:07:05.488, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.487, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/13 02:07:05.491, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB 2022/04/13 02:07:05.495, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.496, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/13 02:07:05.496, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/13 02:07:05.500, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB 2022/04/13 02:07:05.499, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB 2022/04/13 02:07:05.502, Tesla V100-SXM2-32GB, 470.57.02, 33 %, 29 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.502, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.504, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB 2022/04/13 02:07:05.503, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB 2022/04/13 02:07:05.504, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB 2022/04/13 02:07:05.507, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.506, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30376 MiB, 2134 MiB 2022/04/13 02:07:05.508, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB 2022/04/13 02:07:05.510, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.511, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.513, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.513, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB 2022/04/13 02:07:05.515, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB 2022/04/13 02:07:05.517, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.517, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30396 MiB, 2114 MiB 2022/04/13 02:07:05.518, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.520, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.521, Tesla V100-SXM2-32GB, 470.57.02, 33 %, 29 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.522, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.523, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.524, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.526, Tesla V100-SXM2-32GB, 470.57.02, 33 %, 29 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.527, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30472 MiB, 2038 MiB 2022/04/13 02:07:05.527, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.530, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.531, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.532, Tesla V100-SXM2-32GB, 470.57.02, 33 %, 29 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.533, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.534, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.536, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.537, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30448 MiB, 2062 MiB 2022/04/13 02:07:05.537, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.540, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.542, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.543, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.544, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.546, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.547, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30456 MiB, 2054 MiB 2022/04/13 02:07:05.547, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.550, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.552, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.553, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.554, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.556, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.556, Tesla V100-SXM2-32GB, 470.57.02, 1 %, 1 %, 32510 MiB, 30388 MiB, 2122 MiB 2022/04/13 02:07:05.557, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.562, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.563, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.564, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.566, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30332 MiB, 2178 MiB 2022/04/13 02:07:05.567, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.572, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.574, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB 2022/04/13 02:07:05.576, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 5 %, 32510 MiB, 30412 MiB, 2098 MiB [rank:0] iter: 200/1100, loss: 0.4828409850597382, latency(ms): 9.3839001283049583 | 2022-04-13 02:07:06.221 [rank:0] iter: 300/1100, loss: 0.5004616975784302, latency(ms): 6.5795617178082466 | 2022-04-13 02:07:06.879 [rank:0] iter: 400/1100, loss: 0.4932110309600830, latency(ms): 6.5245559811592102 | 2022-04-13 02:07:07.531 [rank:0] iter: 500/1100, loss: 0.4957191944122314, latency(ms): 6.0526554286479950 | 2022-04-13 02:07:08.137 [rank:0] iter: 600/1100, loss: 0.4764851331710815, latency(ms): 6.0344545915722847 | 2022-04-13 02:07:08.740 [rank:0] iter: 700/1100, loss: 0.4760075509548187, latency(ms): 6.2122776731848717 | 2022-04-13 02:07:09.361 [rank:0] iter: 800/1100, loss: 0.4814506769180298, latency(ms): 6.0103785246610641 | 2022-04-13 02:07:09.962 [rank:0] iter: 900/1100, loss: 0.4838238060474396, latency(ms): 6.2260261178016663 | 2022-04-13 02:07:10.585 [rank:0] iter: 1000/1100, loss: 0.4824652969837189, latency(ms): 6.0869899764657021 | 2022-04-13 02:07:11.194 [rank:0] iter: 1100/1100, loss: 0.4794503450393677, latency(ms): 6.2189916893839836 | 2022-04-13 02:07:11.815