loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loaded library: loaded library: loaded library: loaded library: loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 loaded library: loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1/usr/lib/x86_64-linux-gnu/libibverbs.so.1 W20220409 01:58:42.500550 5267 rpc_client.cpp:190] LoadServer 10.7.208.28 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220409 01:58:42.503685 5268 rpc_client.cpp:190] LoadServer 10.7.208.28 Failed at 0 times error_code 14 error_message failed to connect to all addresses W20220409 01:58:42.506376 5266 rpc_client.cpp:190] LoadServer 10.7.208.28 Failed at 0 times error_code 14 error_message failed to connect to all addresses ------------------------ arguments ------------------------ batches_per_epoch ............................... 625 channel_last .................................... False ddp ............................................. False exit_num ........................................ 300 fuse_bn_add_relu ................................ True fuse_bn_relu .................................... True gpu_stat_file ................................... None grad_clipping ................................... 0.0 graph ........................................... True label_smoothing ................................. 0.1 learning_rate ................................... 2.048 legacy_init ..................................... False load_path ....................................... None lr_decay_type ................................... cosine metric_local .................................... True metric_train_acc ................................ True momentum ........................................ 0.875 nccl_fusion_max_ops ............................. 24 nccl_fusion_threshold_mb ........................ 16 num_classes ..................................... 1000 num_devices_per_node ............................ 8 num_epochs ...................................... 1 num_nodes ....................................... 1 ofrecord_part_num ............................... 256 ofrecord_path ................................... /dataset/79846248 print_interval .................................. 100 print_timestamp ................................. False samples_per_epoch ............................... 1281167 save_init ....................................... False save_path ....................................... None scale_grad ...................................... True skip_eval ....................................... True synthetic_data .................................. False total_batches ................................... -1 train_batch_size ................................ 256 train_global_batch_size ......................... 2048 use_fp16 ........................................ False use_gpu_decode .................................. False val_batch_size .................................. 50 val_batches_per_epoch ........................... 125 val_global_batch_size ........................... 400 val_samples_per_epoch ........................... 50000 warmup_epochs ................................... 5 weight_decay .................................... 3.0517578125e-05 zero_init_residual .............................. True -------------------- end of arguments --------------------- ***** Model Init ***** ***** Model Init Finish, time escapled: 2.93442 s ***** [rank:5] [train], epoch: 0/1, iter: 100/625, loss: 0.86777, top1: 0.00090, throughput: 272.37 | 2022-04-09 02:00:31.837 [rank:4] [train], epoch: 0/1, iter: 100/625, loss: 0.86763, top1: 0.00078, throughput: 272.38 | 2022-04-09 02:00:31.838 [rank:7] [train], epoch: 0/1, iter: 100/625, loss: 0.86775, top1: 0.00105, throughput: 272.34 | 2022-04-09 02:00:31.843 [rank:3] [train], epoch: 0/1, iter: 100/625, loss: 0.86766, top1: 0.00102, throughput: 272.37 | 2022-04-09 02:00:31.845 [rank:6] [train], epoch: 0/1, iter: 100/625, loss: 0.86751, top1: 0.00102, throughput: 272.36 | 2022-04-09 02:00:31.838 [rank:2] [train], epoch: 0/1, iter: 100/625, loss: 0.86735, top1: 0.00109, throughput: 272.35 | 2022-04-09 02:00:31.842 [rank:0] [train], epoch: 0/1, iter: 100/625, loss: 0.86750, top1: 0.00113, throughput: 272.34 | 2022-04-09 02:00:31.843 [rank:1] [train], epoch: 0/1, iter: 100/625, loss: 0.86758, top1: 0.00109, throughput: 272.34 | 2022-04-09 02:00:31.844 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:00:32.147, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 56 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.157, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 56 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.166, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 57 %, 32510 MiB, 8550 MiB, 23960 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:00:32.173, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 56 %, 32510 MiB, 8590 MiB, 23920 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:00:32.177, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 56 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.177, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 56 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.190, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.190, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 56 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.190, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 56 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.192, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.193, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/09 02:00:32.200, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 61 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.200, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.201, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.201, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.209, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.209, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.209, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.210, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8424 MiB, 24086 MiB 2022/04/09 02:00:32.214, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 62 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.214, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.217, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.218, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.219, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.221, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.221, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.221, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8484 MiB, 24026 MiB 2022/04/09 02:00:32.225, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.226, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.226, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.227, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.228, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.230, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.230, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.230, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8550 MiB, 23960 MiB 2022/04/09 02:00:32.234, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.235, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.236, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 61 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.237, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 61 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.238, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.238, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.239, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 62 %, 32510 MiB, 8590 MiB, 23920 MiB 2022/04/09 02:00:32.243, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 61 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.243, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 61 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.244, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 62 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.245, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 62 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.247, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.247, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.247, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 61 %, 32510 MiB, 8530 MiB, 23980 MiB 2022/04/09 02:00:32.251, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 62 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.252, Tesla V100-SXM2-32GB, 470.57.02, 99 %, 62 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.252, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.254, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.255, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 64 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.256, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 64 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.256, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 64 %, 32510 MiB, 8414 MiB, 24096 MiB 2022/04/09 02:00:32.261, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.262, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.265, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 63 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.266, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 63 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.266, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 63 %, 32510 MiB, 8272 MiB, 24238 MiB 2022/04/09 02:00:32.276, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.277, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB 2022/04/09 02:00:32.277, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 60 %, 32510 MiB, 8498 MiB, 24012 MiB [rank:5] [train], epoch: 0/1, iter: 200/625, loss: 0.86784, top1: 0.00109, throughput: 389.12 | 2022-04-09 02:01:37.626 [rank:4] [train], epoch: 0/1, iter: 200/625, loss: 0.86792, top1: 0.00047, throughput: 389.12 | 2022-04-09 02:01:37.628 [rank:0] [train], epoch: 0/1, iter: 200/625, loss: 0.86785, top1: 0.00082, throughput: 389.13 | 2022-04-09 02:01:37.632 [rank:1] [train], epoch: 0/1, iter: 200/625, loss: 0.86757, top1: 0.00133, throughput: 389.13 | 2022-04-09 02:01:37.632 [rank:7] [train], epoch: 0/1, iter: 200/625, loss: 0.86755, top1: 0.00133, throughput: 389.12 | 2022-04-09 02:01:37.632 [rank:6] [train], epoch: 0/1, iter: 200/625, loss: 0.86796, top1: 0.00078, throughput: 389.11 | 2022-04-09 02:01:37.629 [rank:2] [train], epoch: 0/1, iter: 200/625, loss: 0.86814, top1: 0.00098, throughput: 389.13 | 2022-04-09 02:01:37.630 [rank:3] [train], epoch: 0/1, iter: 200/625, loss: 0.86775, top1: 0.00086, throughput: 389.11 | 2022-04-09 02:01:37.635 [rank:6] [train], epoch: 0/1, iter: 300/625, loss: 0.86726, top1: 0.00074, throughput: 387.52 | 2022-04-09 02:02:43.690 [rank:5] [train], epoch: 0/1, iter: 300/625, loss: 0.86758, top1: 0.00094, throughput: 387.51 | 2022-04-09 02:02:43.688 [rank:2] [train], epoch: 0/1, iter: 300/625, loss: 0.86772, top1: 0.00090, throughput: 387.52 | 2022-04-09 02:02:43.691 [rank:4] [train], epoch: 0/1, iter: 300/625, loss: 0.86727, top1: 0.00098, throughput: 387.50 | 2022-04-09 02:02:43.692 [rank:1] [train], epoch: 0/1, iter: 300/625, loss: 0.86757, top1: 0.00125, throughput: 387.50 | 2022-04-09 02:02:43.696 [rank:0] [train], epoch: 0/1, iter: 300/625, loss: 0.86786, top1: 0.00086, throughput: 387.51 | 2022-04-09 02:02:43.695 [rank:3] [train], epoch: 0/1, iter: 300/625, loss: 0.86773, top1: 0.00070, throughput: 387.51 | 2022-04-09 02:02:43.697 [rank:7] [train], epoch: 0/1, iter: 300/625, loss: 0.86756, top1: 0.00113, throughput: 387.51 | 2022-04-09 02:02:43.695