diff --git a/example/auto_compression/detection/paddle_inference_eval.py b/example/auto_compression/detection/paddle_inference_eval.py index d2e12afd1..c66ce7113 100644 --- a/example/auto_compression/detection/paddle_inference_eval.py +++ b/example/auto_compression/detection/paddle_inference_eval.py @@ -82,9 +82,15 @@ def argsparser(): parser.add_argument("--img_shape", type=int, default=640, help="input_size") parser.add_argument( '--include_nms', - type=bool, - default=True, + type=str, + default='True', help="Whether include nms or not.") + # 是否用来测速 + parser.add_argument( + '--speed', + type=str, + default='True', + help="if speed is True, it will print the inference time.") return parser @@ -238,9 +244,11 @@ def load_predictor( config = Config( os.path.join(model_dir, "model.pdmodel"), os.path.join(model_dir, "model.pdiparams")) + + config.enable_memory_optim() if device == "GPU": # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) + config.enable_use_gpu(1000, 0) # optimize graph and fuse op config.switch_ir_optim(True) else: @@ -260,7 +268,7 @@ def load_predictor( } if precision in precision_map.keys() and use_trt: config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batch_size, + workspace_size=(1 << 30) * batch_size, max_batch_size=batch_size, min_subgraph_size=min_subgraph_size, precision_mode=precision_map[precision], @@ -297,6 +305,7 @@ def predict_image(predictor, img, scale_factor = image_preprocess(image_file, image_shape) inputs = {} inputs["image"] = img + if FLAGS.include_nms: inputs['scale_factor'] = scale_factor input_names = predictor.get_input_names() @@ -354,6 +363,9 @@ def eval(predictor, val_loader, metric, rerun_flag=False): input_names = predictor.get_input_names() output_names = predictor.get_output_names() boxes_tensor = predictor.get_output_handle(output_names[0]) + print("output_names:", output_names) + print("Number of outputs:", len(output_names)) + print("FLAGS.include_nms:", FLAGS.include_nms) if 
def inference_time(predictor, val_loader, metric, rerun_flag=False):
    """Benchmark predictor latency over ``val_loader`` without computing mAP.

    Each batch is copied into the predictor's input handles, the call to
    ``predictor.run()`` is timed, and the min / max / average latency in
    milliseconds is printed. No post-processing is performed and ``metric``
    is never updated.

    Args:
        predictor: Inference predictor exposing ``get_input_names``,
            ``get_output_names``, ``get_input_handle``, ``get_output_handle``
            and ``run``.
        val_loader: Iterable of batches; each batch is a mapping that holds
            an entry for every predictor input name.
        metric: Unused. Accepted so the signature matches ``eval``.
        rerun_flag: When true, return right after the first
            ``predictor.run()`` without recording or printing any timing.
    """
    input_names = predictor.get_input_names()
    output_names = predictor.get_output_names()
    print("output_names:", output_names)
    print("Number of outputs:", len(output_names))
    print("FLAGS.include_nms:", FLAGS.include_nms)

    # When NMS is part of the model, the second output is copied back to the
    # host inside the timed region of every iteration.
    boxes_num = None
    if FLAGS.include_nms:
        boxes_num = predictor.get_output_handle(output_names[1])

    predict_time = 0.0
    time_min = float("inf")
    time_max = float("-inf")
    timed_batches = 0

    for data in val_loader:
        data_all = {k: np.array(v) for k, v in data.items()}
        for name in input_names:
            predictor.get_input_handle(name).copy_from_cpu(data_all[name])

        # Drain work queued before this point so it is not billed to the batch.
        paddle.device.cuda.synchronize()
        start_time = time.perf_counter()
        predictor.run()
        if boxes_num is not None:
            boxes_num.copy_to_cpu()
        if rerun_flag:
            return
        # Synchronize again before stopping the clock. Without NMS nothing
        # after run() copies data back, so the timer could otherwise stop
        # while device work is still in flight (assumes run() may return
        # before the device finishes -- TODO confirm against Paddle Inference).
        paddle.device.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        time_min = min(time_min, elapsed)
        time_max = max(time_max, elapsed)
        predict_time += elapsed
        timed_batches += 1

    if timed_batches == 0:
        # An empty loader would otherwise divide by zero and print inf/-inf.
        print("[Benchmark]No batches were timed; inference time is unavailable.")
        sys.stdout.flush()
        return

    time_avg = predict_time / timed_batches
    print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
        round(time_min * 1000, 2),
        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
    sys.stdout.flush()