loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220429 02:18:15.751116 3607 rpc_client.cpp:190] LoadServer 10.7.88.56 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220429 02:18:15.752171 3605 rpc_client.cpp:190] LoadServer 10.7.88.56 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220429 02:18:15.752058 3608 rpc_client.cpp:190] LoadServer 10.7.88.56 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 131072 batch_size_per_proc ............................. 16384 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. False deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. graph hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.0618331320583820, latency(ms): 193.8548646494746208 | 2022-04-29 02:18:46.731 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/29 02:18:46.864, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/29 02:18:46.868, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/04/29 02:18:46.868, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.870, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.873, Tesla V100-SXM2-32GB, 470.57.02, 98 %, 40 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.874, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/29 02:18:46.878, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/04/29 02:18:46.881, Tesla V100-SXM2-32GB, 470.57.02, 88 %, 36 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.882, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/29 02:18:46.886, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.886, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.889, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 29 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.891, Tesla V100-SXM2-32GB, 470.57.02, 88 %, 36 %, 32510 MiB, 29833 MiB, 2677 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/29 02:18:46.895, Tesla V100-SXM2-32GB, 470.57.02, 88 %, 36 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.894, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.896, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/29 02:18:46.898, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.899, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 29 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.902, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 29 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.902, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.903, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/04/29 02:18:46.903, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.904, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.904, Tesla V100-SXM2-32GB, 470.57.02, 72 %, 29 %, 32510 MiB, 29709 MiB, 2801 MiB 2022/04/29 02:18:46.905, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.907, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.912, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.913, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/04/29 02:18:46.913, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.913, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.915, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/04/29 02:18:46.915, Tesla V100-SXM2-32GB, 470.57.02, 43 %, 18 %, 32510 MiB, 29665 MiB, 2845 MiB 2022/04/29 02:18:46.916, Tesla V100-SXM2-32GB, 470.57.02, 59 %, 25 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.917, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.923, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.924, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.924, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.924, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 29 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.926, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.926, Tesla V100-SXM2-32GB, 470.57.02, 6 %, 3 %, 32510 MiB, 29881 MiB, 2629 MiB 2022/04/29 02:18:46.928, Tesla V100-SXM2-32GB, 470.57.02, 59 %, 25 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.933, Tesla V100-SXM2-32GB, 470.57.02, 59 %, 25 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.934, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.934, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 29 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.934, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.936, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.937, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29833 MiB, 2677 MiB 2022/04/29 02:18:46.945, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.945, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.945, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.946, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.947, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29785 MiB, 2725 MiB 2022/04/29 02:18:46.954, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.954, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.954, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.955, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.956, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29649 MiB, 2861 MiB 2022/04/29 02:18:46.962, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.962, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.969, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.969, Tesla V100-SXM2-32GB, 470.57.02, 24 %, 11 %, 32510 MiB, 29537 MiB, 2973 MiB 2022/04/29 02:18:46.976, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.978, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29697 MiB, 2813 MiB 2022/04/29 02:18:46.982, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29697 MiB, 2813 MiB [rank:0] iter: 200/1100, loss: 0.0581160746514797, latency(ms): 23.8802395388484001 | 2022-04-29 02:18:49.119 [rank:0] iter: 300/1100, loss: 0.0572532229125500, latency(ms): 21.4691192656755447 | 2022-04-29 02:18:51.266 [rank:0] iter: 400/1100, loss: 0.0562291704118252, latency(ms): 22.2772282734513283 | 2022-04-29 02:18:53.494 [rank:0] iter: 500/1100, loss: 0.0559641160070896, latency(ms): 21.9757009297609329 | 2022-04-29 02:18:55.692 [rank:0] iter: 600/1100, loss: 0.0555189996957779, latency(ms): 21.9271464645862579 | 2022-04-29 02:18:57.884 [rank:0] iter: 700/1100, loss: 0.0546068586409092, latency(ms): 21.6222833842039108 | 2022-04-29 02:19:00.046 [rank:0] iter: 800/1100, loss: 0.0547159053385258, latency(ms): 21.8664956465363503 | 2022-04-29 02:19:02.233 [rank:0] iter: 900/1100, loss: 0.0536865517497063, latency(ms): 21.3637054339051247 | 2022-04-29 02:19:04.370 [rank:0] iter: 1000/1100, loss: 0.0532917864620686, latency(ms): 21.2974664941430092 | 2022-04-29 02:19:06.499 [rank:0] iter: 1100/1100, loss: 0.0533656515181065, latency(ms): 21.4030868932604790 | 2022-04-29 02:19:08.640