22
22
namespace onnxruntime {
23
23
namespace utils {
24
24
25
+ void NodeDumpAnalysis::Add (const std::string& node_name, const std::string& op_type, bool is_half_overflow) {
26
+ std::lock_guard<std::mutex> lock (set_mutex);
27
+ if (is_half_overflow) {
28
+ auto p = half_overflow_nodes.insert (node_name);
29
+ if (p.second ) { // insert succeeded
30
+ ++half_overflow_ops[op_type];
31
+ }
32
+ }
33
+
34
+ counter++;
35
+ }
36
+
37
+ void NodeDumpAnalysis::PrintToStdOut (const std::string& model_path) {
38
+ std::lock_guard<std::mutex> lock (set_mutex);
39
+ if (counter == 0 ) {
40
+ return ;
41
+ }
42
+
43
+ // We added counter twice per node (once for node inputs, once for node outputs), so we need to divide it by 2.
44
+ counter /= 2 ;
45
+
46
+ std::cout << " Total counter in node dumping: " << counter << std::endl;
47
+
48
+ if (!half_overflow_nodes.empty ()) {
49
+ std::cout << " Found " << half_overflow_nodes.size () << " nodes cannot be converted to half precision due to potential input/output overflow." << std::endl;
50
+
51
+ if (half_overflow_nodes.count (" " ) > 0 ) {
52
+ std::cout << " Warning: some node name is empty and node_block_list is not completed. "
53
+ << " Please update the model to make sure each node has name then run this tool again!" << std::endl;
54
+ }
55
+
56
+ // Sort and display the op frequency in the descending order
57
+ std::cout << " Operator frequencies for these nodes:" << std::endl;
58
+ std::vector<std::pair<std::string, int >> op_freq (half_overflow_ops.begin (), half_overflow_ops.end ());
59
+ std::sort (op_freq.begin (), op_freq.end (),
60
+ [](const std::pair<std::string, int >& a, const std::pair<std::string, int >& b) {
61
+ return b.second < a.second ;
62
+ });
63
+ for (const auto & pair : op_freq) {
64
+ std::cout << pair.first << " : " << pair.second << std::endl;
65
+ }
66
+ } else {
67
+ std::cout << " No node has potential overflow during half conversion so node_block_list is empty." << std::endl;
68
+ }
69
+
70
+ std::cout << " # -------" << std::endl;
71
+ std::cout << " # Example python script for float16 conversion" << std::endl;
72
+ std::cout << " # For details, search `node_block_list` in https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/float16.py" << std::endl;
73
+ std::cout << " # -------" << std::endl;
74
+ std::cout << " from onnxruntime.transformers.onnx_model import OnnxModel" << std::endl;
75
+ std::cout << " m = OnnxModel(onnx.load('" << model_path << " '))" << std::endl;
76
+ if (!half_overflow_nodes.empty ()) {
77
+ std::cout << " node_block_list = [" << std::endl;
78
+ for (const auto & node : half_overflow_nodes) {
79
+ if (!node.empty ()) {
80
+ std::cout << " '" << node << " '," << std::endl;
81
+ }
82
+ }
83
+ std::cout << " ]" << std::endl;
84
+ std::cout << " m.convert_float_to_float16(keep_io_types=False, node_block_list=node_block_list)" << std::endl;
85
+ } else {
86
+ std::cout << " m.convert_float_to_float16(keep_io_types=False)" << std::endl;
87
+ }
88
+
89
+ std::cout << " m.save_model_to_file('fp16/optimized.onnx', use_external_data_format=False)" << std::endl;
90
+ }
91
+
25
92
namespace {
26
93
27
94
struct TensorMetadata {
@@ -59,10 +126,13 @@ bool FilterNode(const NodeDumpOptions& dump_options, const Node& node) {
59
126
}
60
127
61
128
template <typename T>
62
- void DumpTensorToStdOut (const Tensor& tensor, const NodeDumpOptions& dump_options) {
63
- onnxruntime::utils::PrintCpuTensor<T>(tensor, dump_options.snippet_threshold , dump_options.snippet_edge_items );
64
- if (dump_options.dump_flags & NodeDumpOptions::DumpFlags::StatisticsData) {
65
- onnxruntime::utils::PrintCpuTensorStats<T>(tensor);
129
+ void DumpTensorToStdOut (const Tensor& tensor, const NodeDumpOptions& dump_options, TensorStatisticsData& tensor_statistics) {
130
+ if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0 ) {
131
+ onnxruntime::utils::PrintCpuTensor<T>(tensor, dump_options.snippet_threshold , dump_options.snippet_edge_items );
132
+ }
133
+
134
+ if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::StatisticsData) != 0 ) {
135
+ onnxruntime::utils::PrintCpuTensorStats<T>(tensor, tensor_statistics);
66
136
}
67
137
}
68
138
@@ -295,10 +365,10 @@ void InsertNodePlacementToSqliteDb(const NodeDumpContext& dump_context, const No
295
365
296
366
void DumpCpuTensor (
297
367
const NodeDumpOptions& dump_options,
298
- const Tensor& tensor, const TensorMetadata& tensor_metadata) {
368
+ const Tensor& tensor, const TensorMetadata& tensor_metadata, TensorStatisticsData& tensor_statistics ) {
299
369
switch (dump_options.data_destination ) {
300
370
case NodeDumpOptions::DataDestination::StdOut: {
301
- DispatchOnTensorType (tensor.DataType (), DumpTensorToStdOut, tensor, dump_options);
371
+ DispatchOnTensorType (tensor.DataType (), DumpTensorToStdOut, tensor, dump_options, tensor_statistics );
302
372
break ;
303
373
}
304
374
case NodeDumpOptions::DataDestination::TensorProtoFiles: {
@@ -321,15 +391,15 @@ void DumpCpuTensor(
321
391
322
392
void DumpTensor (
323
393
const NodeDumpOptions& dump_options,
324
- const Tensor& tensor, TensorMetadata& tensor_metadata,
394
+ const Tensor& tensor, TensorMetadata& tensor_metadata, TensorStatisticsData& tensor_statistics,
325
395
const SessionState& session_state) {
326
396
// check tensor is on CPU before dumping it
327
397
auto & tensor_location = tensor.Location ();
328
398
if (tensor_location.device .Type () == OrtDevice::CPU ||
329
399
tensor_location.mem_type == OrtMemTypeCPUInput ||
330
400
tensor_location.mem_type == OrtMemTypeCPUOutput) {
331
401
tensor_metadata.device_type = " CPU" ;
332
- DumpCpuTensor (dump_options, tensor, tensor_metadata);
402
+ DumpCpuTensor (dump_options, tensor, tensor_metadata, tensor_statistics );
333
403
} else {
334
404
std::cout << tensor_location << " \n " ;
335
405
@@ -345,7 +415,7 @@ void DumpTensor(
345
415
auto status = data_transfer_mgr.CopyTensor (tensor, cpu_tensor);
346
416
if (status == common::Status::OK ()) {
347
417
tensor_metadata.device_type = " GPU" ;
348
- DumpCpuTensor (dump_options, cpu_tensor, tensor_metadata);
418
+ DumpCpuTensor (dump_options, cpu_tensor, tensor_metadata, tensor_statistics );
349
419
} else {
350
420
std::cout << " failed to transfer data to cpu.\n " ;
351
421
}
@@ -383,6 +453,11 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
383
453
if (ParseEnvironmentVariableWithDefault<bool >(env_vars::kDumpStatisticsData , false )) {
384
454
opts.dump_flags |= NodeDumpOptions::DumpFlags::StatisticsData;
385
455
}
456
+ if (ParseEnvironmentVariableWithDefault<bool >(env_vars::kDumpHalfConversionOverflow , false )) {
457
+ // Statistics data is required for half conversion overflow detection.
458
+ opts.dump_flags |= NodeDumpOptions::DumpFlags::StatisticsData;
459
+ opts.dump_flags |= NodeDumpOptions::DumpFlags::HalfConversionOverflow;
460
+ }
386
461
387
462
opts.filter .name_pattern = Env::Default ().GetEnvironmentVar (env_vars::kNameFilter );
388
463
opts.filter .op_type_pattern = Env::Default ().GetEnvironmentVar (env_vars::kOpTypeFilter );
@@ -402,6 +477,13 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
402
477
opts.snippet_threshold = ParseEnvironmentVariableWithDefault<int >(env_vars::kSnippetThreshold , kDefaultSnippetThreshold );
403
478
opts.snippet_edge_items = ParseEnvironmentVariableWithDefault<int >(env_vars::kSnippetEdgeItems , kDefaultSnippetEdgeItems );
404
479
480
+ constexpr int kMaxHalfThreshold = 65504 ;
481
+ // The default value is set to have reasonable margin for input variance.
482
+ int threshold = ParseEnvironmentVariableWithDefault<int >(env_vars::kHalfOverflowThreshold , 50000 );
483
+ ORT_ENFORCE (threshold > 0 && threshold <= kMaxHalfThreshold ,
484
+ debug_node_inputs_outputs_env_vars::kHalfOverflowThreshold , " shall be a positive integer <= " , kMaxHalfThreshold );
485
+ opts.half_overflow_threshold = static_cast <float >(threshold);
486
+
405
487
if (ParseEnvironmentVariableWithDefault<bool >(env_vars::kAppendRankToFileName , false )) {
406
488
std::string rank = Env::Default ().GetEnvironmentVar (" OMPI_COMM_WORLD_RANK" );
407
489
if (rank.empty ()) {
@@ -452,7 +534,8 @@ void DumpNodeInputs(
452
534
const NodeDumpContext& dump_context,
453
535
const OpKernelContext& context,
454
536
const Node& node,
455
- const SessionState& session_state) {
537
+ const SessionState& session_state,
538
+ NodeDumpAnalysis& dump_analysis) {
456
539
const bool is_any_output_dumped = IsAnyOutputDumped (dump_options);
457
540
if (!is_any_output_dumped) {
458
541
return ;
@@ -477,6 +560,9 @@ void DumpNodeInputs(
477
560
const auto & input_defs = node.InputDefs ();
478
561
TensorMetadata tensor_metadata;
479
562
563
+ bool check_half_overflow = (dump_options.data_destination == NodeDumpOptions::DataDestination::StdOut) &&
564
+ (dump_options.dump_flags & NodeDumpOptions::DumpFlags::HalfConversionOverflow) != 0 ;
565
+ bool potential_half_overflow = false ;
480
566
for (auto i = 0 , end = context.InputCount (); i < end; ++i) {
481
567
if (input_defs[i]->Exists ()) {
482
568
std::cout << " Input " << i << " Name: " << input_defs[i]->Name () << " \n " ;
@@ -491,11 +577,20 @@ void DumpNodeInputs(
491
577
const bool is_shape_set = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::Shape) != 0 ;
492
578
PrintIf (is_shape_set, MakeString (" Shape: " , shape, " \n " ));
493
579
494
- if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0 ) {
580
+ if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0 || check_half_overflow ) {
495
581
tensor_metadata.name = input_defs[i]->Name ();
496
582
tensor_metadata.step = dump_context.iteration ;
497
583
tensor_metadata.consumer = node.Name () + " :" + std::to_string (i);
498
- DumpTensor (dump_options, *tensor, tensor_metadata, session_state);
584
+
585
+ TensorStatisticsData tensor_statistics;
586
+ DumpTensor (dump_options, *tensor, tensor_metadata, tensor_statistics, session_state);
587
+
588
+ if (check_half_overflow && tensor_statistics.is_float ) {
589
+ float threshold = dump_options.half_overflow_threshold ;
590
+ if (tensor_statistics.float_min < -threshold || tensor_statistics.float_max > threshold) {
591
+ potential_half_overflow = true ;
592
+ }
593
+ }
499
594
}
500
595
} else {
501
596
std::cout << " is empty optional tensor.\n " ;
@@ -511,22 +606,28 @@ void DumpNodeInputs(
511
606
std::cout << " Input " << i << " is optional and was not provided.\n " ;
512
607
}
513
608
}
609
+
610
+ if (check_half_overflow) {
611
+ dump_analysis.Add (node.Name (), node.OpType (), potential_half_overflow);
612
+ }
514
613
}
515
614
516
615
void DumpNodeInputs (
517
616
const NodeDumpContext& dump_context,
518
617
const OpKernelContext& context,
519
618
const Node& node,
520
- const SessionState& session_state) {
521
- DumpNodeInputs (NodeDumpOptionsFromEnvironmentVariables (), dump_context, context, node, session_state);
619
+ const SessionState& session_state,
620
+ NodeDumpAnalysis& dump_analysis) {
621
+ DumpNodeInputs (NodeDumpOptionsFromEnvironmentVariables (), dump_context, context, node, session_state, dump_analysis);
522
622
}
523
623
524
624
void DumpNodeOutputs (
525
625
const NodeDumpOptions& dump_options,
526
626
const NodeDumpContext& dump_context,
527
627
OpKernelContext& context,
528
628
const Node& node,
529
- const SessionState& session_state) {
629
+ const SessionState& session_state,
630
+ NodeDumpAnalysis& dump_analysis) {
530
631
const bool is_any_output_dumped = IsAnyOutputDumped (dump_options);
531
632
if (!is_any_output_dumped) {
532
633
return ;
@@ -549,6 +650,9 @@ void DumpNodeOutputs(
549
650
const auto & output_defs = node.OutputDefs ();
550
651
TensorMetadata tensor_metadata;
551
652
653
+ bool check_half_overflow = (dump_options.data_destination == NodeDumpOptions::DataDestination::StdOut) &&
654
+ (dump_options.dump_flags & NodeDumpOptions::DumpFlags::HalfConversionOverflow) != 0 ;
655
+ bool potential_half_overflow = false ;
552
656
for (auto i = 0 , end = context.OutputCount (); i < end; ++i) {
553
657
if (output_defs[i]->Exists ()) {
554
658
std::cout << " Output " << i << " Name: " << output_defs[i]->Name () << " \n " ;
@@ -562,11 +666,20 @@ void DumpNodeOutputs(
562
666
const bool is_shape_set = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::Shape) != 0 ;
563
667
PrintIf (is_shape_set, MakeString (" Shape: " , shape, " \n " ));
564
668
565
- if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0 ) {
669
+ if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0 || check_half_overflow ) {
566
670
tensor_metadata.name = output_defs[i]->Name ();
567
671
tensor_metadata.step = dump_context.iteration ;
568
672
tensor_metadata.producer = node.Name () + " :" + std::to_string (i);
569
- DumpTensor (dump_options, *tensor, tensor_metadata, session_state);
673
+
674
+ TensorStatisticsData tensor_statistics;
675
+ DumpTensor (dump_options, *tensor, tensor_metadata, tensor_statistics, session_state);
676
+
677
+ if (check_half_overflow && tensor_statistics.is_float ) {
678
+ float threshold = dump_options.half_overflow_threshold ;
679
+ if (tensor_statistics.float_min < -threshold || tensor_statistics.float_max > threshold) {
680
+ potential_half_overflow = true ;
681
+ }
682
+ }
570
683
}
571
684
} else {
572
685
std::cout << " is empty optional tensor.\n " ;
@@ -582,6 +695,10 @@ void DumpNodeOutputs(
582
695
std::cout << " Output " << i << " is optional and was not produced.\n " ;
583
696
}
584
697
698
+ if (check_half_overflow) {
699
+ dump_analysis.Add (node.Name (), node.OpType (), potential_half_overflow);
700
+ }
701
+
585
702
std::cout << std::endl;
586
703
}
587
704
}
@@ -590,8 +707,9 @@ void DumpNodeOutputs(
590
707
const NodeDumpContext& dump_context,
591
708
OpKernelContext& context,
592
709
const Node& node,
593
- const SessionState& session_state) {
594
- DumpNodeOutputs (NodeDumpOptionsFromEnvironmentVariables (), dump_context, context, node, session_state);
710
+ const SessionState& session_state,
711
+ NodeDumpAnalysis& dump_analysis) {
712
+ DumpNodeOutputs (NodeDumpOptionsFromEnvironmentVariables (), dump_context, context, node, session_state, dump_analysis);
595
713
}
596
714
597
715
} // namespace utils
0 commit comments