loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220510 02:37:58.650792 3606 rpc_client.cpp:190] LoadServer 10.7.187.155 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 131072 batch_size_per_proc ............................. 16384 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. False deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. graph hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.0618466846644878, latency(ms): 190.6551012396812439 | 2022-05-10 02:38:29.667 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/10 02:38:29.840, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.841, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.843, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.844, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/10 02:38:29.849, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/05/10 02:38:29.850, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/10 02:38:29.852, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/10 02:38:29.853, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/10 02:38:29.854, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.856, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 25 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.858, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 25 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.858, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.860, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 25 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.861, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 25 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.861, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.863, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/05/10 02:38:29.862, Tesla V100-SXM2-32GB, 470.57.02, 52 %, 22 %, 32510 MiB, 29645 MiB, 2865 MiB 2022/05/10 02:38:29.865, Tesla V100-SXM2-32GB, 470.57.02, 58 %, 24 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.867, Tesla V100-SXM2-32GB, 470.57.02, 58 %, 24 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.868, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/05/10 02:38:29.869, Tesla V100-SXM2-32GB, 470.57.02, 58 %, 24 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.871, Tesla V100-SXM2-32GB, 470.57.02, 58 %, 24 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.872, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/05/10 02:38:29.873, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 25 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.873, Tesla V100-SXM2-32GB, 470.57.02, 32 %, 14 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/05/10 02:38:29.875, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.877, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.878, Tesla V100-SXM2-32GB, 470.57.02, 60 %, 25 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.879, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.881, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.882, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.883, Tesla V100-SXM2-32GB, 470.57.02, 58 %, 24 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.884, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29817 MiB, 2693 MiB 2022/05/10 02:38:29.885, Tesla V100-SXM2-32GB, 470.57.02, 44 %, 19 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.887, Tesla V100-SXM2-32GB, 470.57.02, 44 %, 19 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.889, Tesla V100-SXM2-32GB, 470.57.02, 58 %, 24 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.890, Tesla V100-SXM2-32GB, 470.57.02, 44 %, 19 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.891, Tesla V100-SXM2-32GB, 470.57.02, 44 %, 19 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.892, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.894, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.894, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/05/10 02:38:29.896, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.898, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.899, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.900, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.902, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.903, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.905, Tesla V100-SXM2-32GB, 470.57.02, 44 %, 19 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.905, Tesla V100-SXM2-32GB, 470.57.02, 18 %, 8 %, 32510 MiB, 29849 MiB, 2661 MiB 2022/05/10 02:38:29.906, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.909, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.910, Tesla V100-SXM2-32GB, 470.57.02, 44 %, 19 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.911, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.913, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.914, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.915, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.916, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/05/10 02:38:29.920, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.924, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.926, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.926, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29601 MiB, 2909 MiB 2022/05/10 02:38:29.930, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.933, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB 2022/05/10 02:38:29.935, Tesla V100-SXM2-32GB, 470.57.02, 14 %, 6 %, 32510 MiB, 29761 MiB, 2749 MiB [rank:0] iter: 200/1100, loss: 0.0581096708774567, latency(ms): 23.8343472406268120 | 2022-05-10 02:38:32.051 [rank:0] iter: 300/1100, loss: 0.0572909079492092, latency(ms): 21.2286418303847313 | 2022-05-10 02:38:34.174 [rank:0] iter: 400/1100, loss: 0.0562483742833138, latency(ms): 20.8992061391472816 | 2022-05-10 02:38:36.263 [rank:0] iter: 500/1100, loss: 0.0559450015425682, latency(ms): 21.3953620567917824 | 2022-05-10 02:38:38.403 [rank:0] iter: 600/1100, loss: 0.0555594079196453, latency(ms): 21.0009237751364708 | 2022-05-10 02:38:40.503 [rank:0] iter: 700/1100, loss: 0.0546239055693150, latency(ms): 21.3770780712366104 | 2022-05-10 02:38:42.641 [rank:0] iter: 800/1100, loss: 0.0547492727637291, latency(ms): 20.9083883836865425 | 2022-05-10 02:38:44.732 [rank:0] iter: 900/1100, loss: 0.0537480153143406, latency(ms): 20.8455853909254074 | 2022-05-10 02:38:46.816 [rank:0] iter: 1000/1100, loss: 0.0533173754811287, latency(ms): 20.9170536696910858 | 2022-05-10 02:38:48.908 [rank:0] iter: 1100/1100, loss: 0.0533905476331711, latency(ms): 20.6362099573016167 | 2022-05-10 02:38:50.972