loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220421 02:16:13.756004 6892 rpc_client.cpp:190] LoadServer 10.7.179.227 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batches_per_epoch ............................... 625 channel_last .................................... False ddp ............................................. True exit_num ........................................ 300 fuse_bn_add_relu ................................ False fuse_bn_relu .................................... False gpu_stat_file ................................... None grad_clipping ................................... 0.0 graph ........................................... False label_smoothing ................................. 0.1 learning_rate ................................... 2.048 legacy_init ..................................... False load_path ....................................... None lr_decay_type ................................... cosine metric_local .................................... True metric_train_acc ................................ True momentum ........................................ 0.875 nccl_fusion_max_ops ............................. 24 nccl_fusion_threshold_mb ........................ 16 num_classes ..................................... 1000 num_devices_per_node ............................ 8 num_epochs ...................................... 1 num_nodes ....................................... 1 ofrecord_part_num ............................... 256 ofrecord_path ................................... /dataset/79846248 print_interval .................................. 100 print_timestamp ................................. False samples_per_epoch ............................... 1281167 save_init ....................................... False save_path ....................................... None scale_grad ...................................... False skip_eval ....................................... True synthetic_data .................................. False total_batches ................................... -1 train_batch_size ................................ 256 train_global_batch_size ......................... 2048 use_fp16 ........................................ False use_gpu_decode .................................. False val_batch_size .................................. 50 val_batches_per_epoch ........................... 125 val_global_batch_size ........................... 400 val_samples_per_epoch ........................... 50000 warmup_epochs ................................... 5 weight_decay .................................... 3.0517578125e-05 zero_init_residual .............................. True -------------------- end of arguments --------------------- ***** Model Init ***** ***** Model Init Finish, time escapled: 2.70145 s ***** [rank:6] [train], epoch: 0/1, iter: 100/625, loss: 0.86702, lr: 0.000000, top1: 0.00098, throughput: 277.28 | 2022-04-21 02:18:01.079 [rank:1] [train], epoch: 0/1, iter: 100/625, loss: 0.86693, lr: 0.000000, top1: 0.00062, throughput: 277.11 | 2022-04-21 02:18:01.088 [rank:4] [train], epoch: 0/1, iter: 100/625, loss: 0.86708, lr: 0.000000, top1: 0.00102, throughput: 277.37 | 2022-04-21 02:18:01.090 [rank:3] [train], epoch: 0/1, iter: 100/625, loss: 0.86719, lr: 0.000000, top1: 0.00066, throughput: 277.22 | 2022-04-21 02:18:01.103 [rank:0] [train], epoch: 0/1, iter: 100/625, loss: 0.86706, lr: 0.000000, top1: 0.00070, throughput: 277.19 | 2022-04-21 02:18:01.107 [rank:7] [train], epoch: 0/1, iter: 100/625, loss: 0.86711, lr: 0.000000, top1: 0.00086, throughput: 277.04 | 2022-04-21 02:18:01.107 [rank:2] [train], epoch: 0/1, iter: 100/625, loss: 0.86730, lr: 0.000000, top1: 0.00078, throughput: 276.52 | 2022-04-21 02:18:01.110 [rank:5] [train], epoch: 0/1, iter: 100/625, loss: 0.86700, lr: 0.000000, top1: 0.00078, throughput: 277.20 | 2022-04-21 02:18:01.173 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.246, Tesla V100-SXM2-32GB, 470.57.02, 48 %, 31 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.247, Tesla V100-SXM2-32GB, 470.57.02, 48 %, 31 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.254, Tesla V100-SXM2-32GB, 470.57.02, 97 %, 40 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.254, Tesla V100-SXM2-32GB, 470.57.02, 97 %, 40 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.258, Tesla V100-SXM2-32GB, 470.57.02, 70 %, 24 %, 32510 MiB, 5196 MiB, 27314 MiB 2022/04/21 02:18:01.258, Tesla V100-SXM2-32GB, 470.57.02, 70 %, 24 %, 32510 MiB, 5196 MiB, 27314 MiB 2022/04/21 02:18:01.262, Tesla V100-SXM2-32GB, 470.57.02, 39 %, 19 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.262, Tesla V100-SXM2-32GB, 470.57.02, 39 %, 19 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.269, Tesla V100-SXM2-32GB, 470.57.02, 83 %, 34 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.269, Tesla V100-SXM2-32GB, 470.57.02, 83 %, 34 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.276, Tesla V100-SXM2-32GB, 470.57.02, 11 %, 0 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.276, Tesla V100-SXM2-32GB, 470.57.02, 11 %, 0 %, 32510 MiB, 5112 MiB, 27398 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.281, Tesla V100-SXM2-32GB, 470.57.02, 97 %, 42 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.281, Tesla V100-SXM2-32GB, 470.57.02, 97 %, 42 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.287, Tesla V100-SXM2-32GB, 470.57.02, 88 %, 48 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.288, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 13 %, 32510 MiB, 5136 MiB, 27374 MiB 2022/04/21 02:18:01.289, Tesla V100-SXM2-32GB, 470.57.02, 67 %, 13 %, 32510 MiB, 5136 MiB, 27374 MiB 2022/04/21 02:18:01.296, Tesla V100-SXM2-32GB, 470.57.02, 97 %, 40 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.303, Tesla V100-SXM2-32GB, 470.57.02, 70 %, 24 %, 32510 MiB, 5196 MiB, 27314 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.309, Tesla V100-SXM2-32GB, 470.57.02, 95 %, 52 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.316, Tesla V100-SXM2-32GB, 470.57.02, 88 %, 47 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.321, Tesla V100-SXM2-32GB, 470.57.02, 83 %, 34 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.327, Tesla V100-SXM2-32GB, 470.57.02, 97 %, 40 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.341, Tesla V100-SXM2-32GB, 470.57.02, 11 %, 0 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.348, Tesla V100-SXM2-32GB, 470.57.02, 70 %, 24 %, 32510 MiB, 5196 MiB, 27314 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.381, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 79 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.382, Tesla V100-SXM2-32GB, 470.57.02, 95 %, 49 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.408, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 76 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.439, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 85 %, 32510 MiB, 5136 MiB, 27374 MiB 2022/04/21 02:18:01.442, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 81 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.447, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 76 %, 32510 MiB, 5120 MiB, 27390 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.456, Tesla V100-SXM2-32GB, 470.57.02, 78 %, 68 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.461, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 86 %, 32510 MiB, 5196 MiB, 27314 MiB 2022/04/21 02:18:01.472, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 76 %, 32510 MiB, 5100 MiB, 27410 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.477, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 79 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.481, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 75 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.487, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 76 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.487, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 76 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.495, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 85 %, 32510 MiB, 5136 MiB, 27374 MiB 2022/04/21 02:18:01.499, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 81 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.500, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 86 %, 32510 MiB, 5196 MiB, 27314 MiB 2022/04/21 02:18:01.501, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 76 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.516, Tesla V100-SXM2-32GB, 470.57.02, 78 %, 68 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.518, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 75 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.518, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 86 %, 32510 MiB, 5196 MiB, 27314 MiB 2022/04/21 02:18:01.522, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 79 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.526, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 81 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.527, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 75 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.537, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 85 %, 32510 MiB, 5136 MiB, 27374 MiB 2022/04/21 02:18:01.538, Tesla V100-SXM2-32GB, 470.57.02, 78 %, 68 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.539, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 81 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.546, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 79 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.558, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 68 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.562, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 50 %, 32510 MiB, 5136 MiB, 27374 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/21 02:18:01.563, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 45 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.588, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 43 %, 32510 MiB, 5100 MiB, 27410 MiB 2022/04/21 02:18:01.589, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 50 %, 32510 MiB, 5136 MiB, 27374 MiB 2022/04/21 02:18:01.604, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 43 %, 32510 MiB, 5120 MiB, 27390 MiB 2022/04/21 02:18:01.611, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 46 %, 32510 MiB, 5196 MiB, 27314 MiB 2022/04/21 02:18:01.615, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 75 %, 32510 MiB, 5172 MiB, 27338 MiB 2022/04/21 02:18:01.620, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 45 %, 32510 MiB, 5180 MiB, 27330 MiB 2022/04/21 02:18:01.625, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 68 %, 32510 MiB, 5112 MiB, 27398 MiB 2022/04/21 02:18:01.627, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 45 %, 32510 MiB, 5056 MiB, 27454 MiB 2022/04/21 02:18:01.634, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 50 %, 32510 MiB, 5136 MiB, 27374 MiB [rank:3] [train], epoch: 0/1, iter: 200/625, loss: 0.86721, lr: 0.000000, top1: 0.00109, throughput: 295.79 | 2022-04-21 02:19:27.650 [rank:0] [train], epoch: 0/1, iter: 200/625, loss: 0.86721, lr: 0.000000, top1: 0.00098, throughput: 295.79 | 2022-04-21 02:19:27.653 [rank:4] [train], epoch: 0/1, iter: 200/625, loss: 0.86722, lr: 0.000000, top1: 0.00062, throughput: 295.67 | 2022-04-21 02:19:27.674 [rank:2] [train], epoch: 0/1, iter: 200/625, loss: 0.86725, lr: 0.000000, top1: 0.00125, throughput: 295.70 | 2022-04-21 02:19:27.685 [rank:1] [train], epoch: 0/1, iter: 200/625, loss: 0.86707, lr: 0.000000, top1: 0.00098, throughput: 295.61 | 2022-04-21 02:19:27.689 [rank:5] [train], epoch: 0/1, iter: 200/625, loss: 0.86690, lr: 0.000000, top1: 0.00066, throughput: 295.88 | 2022-04-21 02:19:27.694 [rank:7] [train], epoch: 0/1, iter: 200/625, loss: 0.86691, lr: 0.000000, top1: 0.00102, throughput: 295.65 | 2022-04-21 02:19:27.695 [rank:6] [train], epoch: 0/1, iter: 200/625, loss: 0.86730, lr: 0.000000, top1: 0.00090, throughput: 295.42 | 2022-04-21 02:19:27.735 [rank:6] [train], epoch: 0/1, iter: 300/625, loss: 0.86713, lr: 0.000000, top1: 0.00098, throughput: 297.42 | 2022-04-21 02:20:53.807 [rank:5] [train], epoch: 0/1, iter: 300/625, loss: 0.86730, lr: 0.000000, top1: 0.00066, throughput: 297.24 | 2022-04-21 02:20:53.818 [rank:7] [train], epoch: 0/1, iter: 300/625, loss: 0.86702, lr: 0.000000, top1: 0.00109, throughput: 297.23 | 2022-04-21 02:20:53.825 [rank:0] [train], epoch: 0/1, iter: 300/625, loss: 0.86709, lr: 0.000000, top1: 0.00078, throughput: 296.95 | 2022-04-21 02:20:53.865 [rank:1] [train], epoch: 0/1, iter: 300/625, loss: 0.86709, lr: 0.000000, top1: 0.00078, throughput: 297.04 | 2022-04-21 02:20:53.871 [rank:4] [train], epoch: 0/1, iter: 300/625, loss: 0.86705, lr: 0.000000, top1: 0.00090, throughput: 296.96 | 2022-04-21 02:20:53.880 [rank:3] [train], epoch: 0/1, iter: 300/625, loss: 0.86707, lr: 0.000000, top1: 0.00125, throughput: 296.88 | 2022-04-21 02:20:53.882 [rank:2] [train], epoch: 0/1, iter: 300/625, loss: 0.86700, lr: 0.000000, top1: 0.00082, throughput: 297.27 | 2022-04-21 02:20:53.803