loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1 ================================================================== Running resnet50: num_gpu_per_node = 1, num_nodes = 1. ================================================================== dtype = float32 gpu_num_per_node = 1 num_nodes = 1 node_ips = ['10.7.156.104'] ctrl_port = 50051 model = resnet50 use_fp16 = None use_xla = None channel_last = False pad_output = None num_epochs = 1 model_load_dir = None save_epoch_interval = 10 save_last = False save_init = False batch_size_per_device = 256 val_batch_size_per_device = 8 nccl_fusion_threshold_mb = 16 nccl_fusion_max_ops = 24 fuse_bn_relu = True fuse_bn_add_relu = True gpu_image_decoder = True image_path = test_img/tiger.jpg num_classes = 1000 num_examples = 1281167 num_val_examples = 50000 rgb_mean = [123.68, 116.779, 103.939] rgb_std = [58.393, 57.12, 57.375] image_shape = [3, 224, 224] label_smoothing = 0.1 model_save_dir = ./output/snapshots/model_save-20220403015414 log_dir = ./output loss_print_every_n_iter = 100 exit_iter = 300 image_size = 224 resize_shorter = 256 train_data_dir = /dataset/79846248/train train_data_part_num = 256 val_data_dir = None val_data_part_num = 256 optimizer = sgd learning_rate = 0.256 wd = 3.0517578125e-05 momentum = 0.875 lr_decay = cosine lr_decay_rate = 0.94 lr_decay_epochs = 2 warmup_epochs = 5 decay_rate = 0.9 epsilon = 1.0 gradient_clipping = 0.0 ------------------------------------------------------------------ Time stamp: 2022-04-03-01:54:14 300 iter per epoch... Loading data from /dataset/79846248/train Optimizer: SGD Loading synthetic data. train: epoch 1, iter 100, loss: 7.075469, top_1: 0.000781, top_k: 0.004922, samples/s: 300.001 1648922139.3551774 timestamp, name, driver_version, utilization.gpu [%], utilization.memory [%], memory.total [MiB], memory.free [MiB], memory.used [MiB] 2022/04/03 01:55:40.015, Tesla V100-SXM2-32GB, 470.57.02, 100 %, 72 %, 32510 MiB, 9046 MiB, 23464 MiB train: epoch 1, iter 200, loss: 6.945951, top_1: 0.001055, top_k: 0.005391, samples/s: 389.121 1648922205.1442726 train: epoch 1, iter 300, loss: 6.929328, top_1: 0.002383, top_k: 0.009219, samples/s: 387.936 1648922271.1345644