loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220512 02:38:52.495620 400 rpc_client.cpp:190] LoadServer 10.7.151.28 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 512 batch_size_per_proc ............................. 64 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. False deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. graph hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.0667183995246887, latency(ms): 174.5538168400526047 | 2022-05-12 02:39:21.552 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/12 02:39:21.749, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/12 02:39:21.757, Tesla V100-SXM2-32GB, 470.57.02, 53 %, 4 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.760, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.761, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.762, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.765, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.771, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/12 02:39:21.772, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.773, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.775, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/05/12 02:39:21.780, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.781, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.780, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.781, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.783, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.783, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.784, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.785, Tesla V100-SXM2-32GB, 470.57.02, 12 %, 0 %, 32510 MiB, 30427 MiB, 2083 MiB 2022/05/12 02:39:21.788, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.790, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.790, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.790, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.793, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.794, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.796, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.797, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30447 MiB, 2063 MiB 2022/05/12 02:39:21.799, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.800, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.801, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.801, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.804, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.805, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.806, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.808, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30599 MiB, 1911 MiB 2022/05/12 02:39:21.810, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.811, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.812, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.812, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.815, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.816, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.817, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.819, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30551 MiB, 1959 MiB 2022/05/12 02:39:21.821, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.822, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.823, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.823, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.827, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.828, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.830, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 0 %, 32510 MiB, 30567 MiB, 1943 MiB 2022/05/12 02:39:21.832, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.833, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.833, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.834, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.838, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.839, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.840, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30431 MiB, 2079 MiB 2022/05/12 02:39:21.844, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.848, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.849, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.850, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30319 MiB, 2191 MiB 2022/05/12 02:39:21.853, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.857, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.858, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB 2022/05/12 02:39:21.859, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 30479 MiB, 2031 MiB [rank:0] iter: 200/1100, loss: 0.0609392598271370, latency(ms): 6.4892443269491196 | 2022-05-12 02:39:22.200 [rank:0] iter: 300/1100, loss: 0.0625722482800484, latency(ms): 2.9626622796058655 | 2022-05-12 02:39:22.497 [rank:0] iter: 400/1100, loss: 0.0614476315677166, latency(ms): 3.1264939531683922 | 2022-05-12 02:39:22.809 [rank:0] iter: 500/1100, loss: 0.0621252730488777, latency(ms): 2.8321730345487595 | 2022-05-12 02:39:23.093 [rank:0] iter: 600/1100, loss: 0.0603549443185329, latency(ms): 3.1331041827797890 | 2022-05-12 02:39:23.406 [rank:0] iter: 700/1100, loss: 0.0592311732470989, latency(ms): 2.9008218273520470 | 2022-05-12 02:39:23.696 [rank:0] iter: 800/1100, loss: 0.0607971698045731, latency(ms): 3.2189615443348885 | 2022-05-12 02:39:24.018 [rank:0] iter: 900/1100, loss: 0.0606326088309288, latency(ms): 3.2711185142397881 | 2022-05-12 02:39:24.345 [rank:0] iter: 1000/1100, loss: 0.0599992722272873, latency(ms): 2.6286374032497406 | 2022-05-12 02:39:24.608 [rank:0] iter: 1100/1100, loss: 0.0602875240147114, latency(ms): 3.1259224936366081 | 2022-05-12 02:39:24.920