loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220415 15:26:47.469053 5230 rpc_client.cpp:190] LoadServer 10.7.250.92 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220415 15:26:47.467670 5227 rpc_client.cpp:190] LoadServer 10.7.250.92 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220415 15:26:47.469203 5228 rpc_client.cpp:190] LoadServer 10.7.250.92 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 512 batch_size_per_proc ............................. 64 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. True deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. eager hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.5352382659912109, latency(ms): 65.7503592595458031 | 2022-04-15 15:27:05.669 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/15 15:27:06.503, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB 2022/04/15 15:27:06.506, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:06.507, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:06.508, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:06.509, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/15 15:27:06.510, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:06.511, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/15 15:27:06.512, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:06.513, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:06.513, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB 2022/04/15 15:27:06.514, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB 2022/04/15 15:27:06.515, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:06.516, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:06.518, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:06.519, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:06.520, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:06.521, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:06.523, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:06.524, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:06.525, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:06.528, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:06.529, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB 2022/04/15 15:27:06.529, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:06.531, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/15 15:27:08.110, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/15 15:27:08.114, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:08.115, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/15 15:27:08.116, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB 2022/04/15 15:27:08.117, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:08.119, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:08.119, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB 2022/04/15 15:27:08.121, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:08.121, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30344 MiB, 2166 MiB 2022/04/15 15:27:08.122, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:08.124, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:08.125, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:08.126, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:08.127, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30364 MiB, 2146 MiB 2022/04/15 15:27:08.128, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:08.129, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:08.131, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:08.132, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:08.132, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30440 MiB, 2070 MiB 2022/04/15 15:27:08.133, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:08.135, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:08.136, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:08.137, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:08.138, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30416 MiB, 2094 MiB 2022/04/15 15:27:08.139, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:08.141, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:08.142, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:08.143, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:08.144, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30424 MiB, 2086 MiB 2022/04/15 15:27:08.144, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB 2022/04/15 15:27:08.146, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:08.148, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:08.149, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:08.149, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30356 MiB, 2154 MiB 2022/04/15 15:27:08.152, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB 2022/04/15 15:27:08.154, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:08.155, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB 2022/04/15 15:27:08.155, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30300 MiB, 2210 MiB 2022/04/15 15:27:08.159, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB 2022/04/15 15:27:08.161, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 0 %, 32510 MiB, 30380 MiB, 2130 MiB [rank:0] iter: 200/1100, loss: 0.4899774789810181, latency(ms): 31.2268365919589996 | 2022-04-15 15:27:08.791 [rank:0] iter: 300/1100, loss: 0.4929096698760986, latency(ms): 6.4278894662857056 | 2022-04-15 15:27:09.434 [rank:0] iter: 400/1100, loss: 0.4930160045623779, latency(ms): 6.0348400846123695 | 2022-04-15 15:27:10.038 [rank:0] iter: 500/1100, loss: 0.4942871928215027, latency(ms): 6.3802062347531319 | 2022-04-15 15:27:10.676 [rank:0] iter: 600/1100, loss: 0.4840598404407501, latency(ms): 6.1174964159727097 | 2022-04-15 15:27:11.287 [rank:0] iter: 700/1100, loss: 0.4703110456466675, latency(ms): 6.2575904279947281 | 2022-04-15 15:27:11.913 [rank:0] iter: 800/1100, loss: 0.4816550016403198, latency(ms): 6.4163897931575775 | 2022-04-15 15:27:12.555 [rank:0] iter: 900/1100, loss: 0.4919722676277161, latency(ms): 6.0248934850096703 | 2022-04-15 15:27:13.157 [rank:0] iter: 1000/1100, loss: 0.4790087640285492, latency(ms): 6.2031962722539902 | 2022-04-15 15:27:13.778 [rank:0] iter: 1100/1100, loss: 0.4751987159252167, latency(ms): 6.2838402390480042 | 2022-04-15 15:27:14.406