loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220409 02:12:35.417598 3606 rpc_client.cpp:190] LoadServer 10.7.102.248 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batch_size ...................................... 131072 batch_size_per_proc ............................. 16384 data_dir ........................................ /dataset/f9f659c5/wdl_ofrecord data_part_name_suffix_length .................... 5 data_part_num ................................... 256 dataset_format .................................. ofrecord ddp ............................................. False deep_dropout_rate ............................... 0.5 deep_embedding_vec_size ......................... 16 deep_vocab_size ................................. 2322444 eval_after_training ............................. False eval_batchs ..................................... 20 eval_interval ................................... 0 execution_mode .................................. graph hidden_size ..................................... 1024 hidden_units_num ................................ 2 learning_rate ................................... 0.001 loss_print_every_n_iter ......................... 100 max_iter ........................................ 1100 model_load_dir .................................. model_save_dir .................................. ./checkpoint num_deep_sparse_fields .......................... 26 num_dense_fields ................................ 13 num_wide_sparse_fields .......................... 2 save_initial_model .............................. False save_model_after_each_eval ...................... False test_name ....................................... noname_test wide_vocab_size ................................. 2322444 -------------------- end of arguments --------------------- [rank:0] iter: 100/1100, loss: 0.0619377195835114, latency(ms): 222.8664024174213409 | 2022-04-09 02:13:09.349 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:13:09.508, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.509, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.510, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.512, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:13:09.515, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.516, Tesla V100-SXM2-32GB, 470.57.02, 90 %, 37 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.517, Tesla V100-SXM2-32GB, 470.57.02, 90 %, 37 %, 32510 MiB, 29711 MiB, 2799 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:13:09.518, Tesla V100-SXM2-32GB, 470.57.02, 90 %, 37 %, 32510 MiB, 29711 MiB, 2799 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:13:09.519, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.520, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.522, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.523, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.524, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.525, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.525, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.526, Tesla V100-SXM2-32GB, 470.57.02, 56 %, 24 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.527, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.529, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.531, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.531, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.532, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.534, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.535, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.536, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29711 MiB, 2799 MiB 2022/04/09 02:13:09.536, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.538, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.540, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.540, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.541, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.543, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.543, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.544, Tesla V100-SXM2-32GB, 470.57.02, 69 %, 29 %, 32510 MiB, 29885 MiB, 2625 MiB 2022/04/09 02:13:09.545, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.547, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.549, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.549, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.550, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.551, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.552, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.553, Tesla V100-SXM2-32GB, 470.57.02, 29 %, 13 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.553, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.555, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.557, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.557, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.558, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.560, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.561, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.562, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29909 MiB, 2601 MiB 2022/04/09 02:13:09.562, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.564, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.566, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.566, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.567, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.569, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.569, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.570, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29719 MiB, 2791 MiB 2022/04/09 02:13:09.571, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.573, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.574, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.578, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.579, Tesla V100-SXM2-32GB, 470.57.02, 0 %, 0 %, 32510 MiB, 29591 MiB, 2919 MiB 2022/04/09 02:13:09.581, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.586, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB 2022/04/09 02:13:09.587, Tesla V100-SXM2-32GB, 470.57.02, 15 %, 6 %, 32510 MiB, 29829 MiB, 2681 MiB [rank:0] iter: 200/1100, loss: 0.0581710152328014, latency(ms): 24.0075412020087242 | 2022-04-09 02:13:11.750 [rank:0] iter: 300/1100, loss: 0.0573150143027306, latency(ms): 21.7154230549931526 | 2022-04-09 02:13:13.921 [rank:0] iter: 400/1100, loss: 0.0562784783542156, latency(ms): 21.3266766071319580 | 2022-04-09 02:13:16.054 [rank:0] iter: 500/1100, loss: 0.0559964850544930, latency(ms): 21.5898210555315018 | 2022-04-09 02:13:18.213 [rank:0] iter: 600/1100, loss: 0.0555732920765877, latency(ms): 21.4204753562808037 | 2022-04-09 02:13:20.355 [rank:0] iter: 700/1100, loss: 0.0546715408563614, latency(ms): 21.6418742388486862 | 2022-04-09 02:13:22.519 [rank:0] iter: 800/1100, loss: 0.0547450259327888, latency(ms): 21.6884928196668625 | 2022-04-09 02:13:24.688 [rank:0] iter: 900/1100, loss: 0.0537478961050510, latency(ms): 21.3465739041566849 | 2022-04-09 02:13:26.823 [rank:0] iter: 1000/1100, loss: 0.0533513054251671, latency(ms): 20.9593136236071587 | 2022-04-09 02:13:28.919 [rank:0] iter: 1100/1100, loss: 0.0534333661198616, latency(ms): 21.4084838703274727 | 2022-04-09 02:13:31.060