50 changes: 41 additions & 9 deletions paddle/fluid/eager/api/utils/global_utils.h
@@ -195,25 +195,57 @@ class EagerBackwardSubGraphNodeRecorder {
}

public:
void AddGradNode(const GradNodeBase* node) { set_.insert(node); }
void RemoveGradNode(const GradNodeBase* node) { set_.erase(node); }
bool ContainsGradNode(const GradNodeBase* node) { return set_.count(node); }
bool NeedCaptureSubGraph() { return need_capture_subgraph_; }
void StartCaptureSubGraph() { need_capture_subgraph_ = true; }
void EndCaptureSubGraph() { need_capture_subgraph_ = false; }
void AddGradNode(const GradNodeBase* node) {
if (need_capture_viz_subgraph_) viz_nodes_.insert(node);
if (need_capture_vlog_subgraph_) {
vlog_nodes_.insert(node);
// record the node's vlog level
node_to_vlog_level_[node] = subgraph_bwd_vlog_level_;
}
}

bool NeedCaptureSubGraph() {
return need_capture_viz_subgraph_ || need_capture_vlog_subgraph_;
}
void StartCaptureSubGraphForViz() { need_capture_viz_subgraph_ = true; }
void StopCaptureSubGraphForViz() { need_capture_viz_subgraph_ = false; }
void StartCaptureSubGraphForVlog() { need_capture_vlog_subgraph_ = true; }
void StopCaptureSubGraphForVlog() { need_capture_vlog_subgraph_ = false; }
void SetDumpDirPath(const std::string& path) { dump_dir_path_ = path; }
const std::string& GetDumpDirPath() { return dump_dir_path_; }
void SetNeedDumpGradTensors(bool need_dump) {
need_dump_grad_tensors_ = need_dump;
}
bool GetNeedDumpGradTensors() { return need_dump_grad_tensors_; }
bool HasCapturedSubgraph() { return !set_.empty(); }
int GetSubGraphBwdVlogLevel(const GradNodeBase* node) {
auto it = node_to_vlog_level_.find(node);
if (it != node_to_vlog_level_.end()) {
return it->second;
}
return 0;
}
void SetSubGraphBwdVlogLevel(int level) { subgraph_bwd_vlog_level_ = level; }
// Is the GradNode within the scope of the backward vlog guard?
bool IsGradNodeInVlogGuard(const GradNodeBase* node) {
return vlog_nodes_.count(node);
}
bool IsGradNodeInVizGuard(const GradNodeBase* node) {
return viz_nodes_.count(node);
}
bool NeedDumpBwdSubGraph() {
return !viz_nodes_.empty() && !dump_dir_path_.empty();
}
bool NeedBwdVlogGuard() { return !vlog_nodes_.empty(); }

private:
std::unordered_set<const GradNodeBase*> set_;
std::unordered_set<const GradNodeBase*> viz_nodes_;
std::unordered_set<const GradNodeBase*> vlog_nodes_;
std::unordered_map<const GradNodeBase*, int> node_to_vlog_level_;
std::string dump_dir_path_;
bool need_capture_vlog_subgraph_ = false;
bool need_capture_viz_subgraph_ = false;
bool need_dump_grad_tensors_ = false;
bool need_capture_subgraph_ = false;
int subgraph_bwd_vlog_level_ = 0;
};

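
The recorder now keeps two independent capture modes: a viz set that feeds the dot-graph dump and a vlog set that drives per-node backward log levels. A minimal Python model of that bookkeeping (illustrative names only, not Paddle API) makes the state transitions explicit:

import dataclasses

@dataclasses.dataclass
class SubGraphRecorder:
    # Mirrors EagerBackwardSubGraphNodeRecorder's dual capture state.
    viz_nodes: set = dataclasses.field(default_factory=set)
    vlog_nodes: set = dataclasses.field(default_factory=set)
    node_to_vlog_level: dict = dataclasses.field(default_factory=dict)
    capture_viz: bool = False
    capture_vlog: bool = False
    vlog_level: int = 0

    def add_grad_node(self, node):
        # A node created while a mode is active joins that mode's set;
        # the two modes are orthogonal and may overlap.
        if self.capture_viz:
            self.viz_nodes.add(node)
        if self.capture_vlog:
            self.vlog_nodes.add(node)
            self.node_to_vlog_level[node] = self.vlog_level

    def need_capture(self):
        return self.capture_viz or self.capture_vlog

recorder = SubGraphRecorder()
recorder.capture_vlog, recorder.vlog_level = True, 4
recorder.add_grad_node("matmul_grad")  # recorded for vlog only
assert recorder.node_to_vlog_level["matmul_grad"] == 4
assert "matmul_grad" not in recorder.viz_nodes
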
25 changes: 14 additions & 11 deletions paddle/fluid/eager/backward.cc
@@ -92,8 +92,8 @@ void ConstructForwardDebugDotGraph(const std::deque<GradNodeBase*>& init_queue,
}
visited.insert(node);
if (need_dump_backward_subgraph &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
node)) {
!egr::EagerBackwardSubGraphNodeRecorder::Instance()
.IsGradNodeInVizGuard(node)) {
// When need_dump_backward_subgraph is enabled, grad nodes that are not
// part of the captured subgraph are not recorded
} else {
@@ -128,9 +128,9 @@ void ConstructForwardDebugDotGraph(const std::deque<GradNodeBase*>& init_queue,
// subgraph
if (need_dump_backward_subgraph &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance()
.ContainsGradNode(node) &&
.IsGradNodeInVizGuard(node) &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance()
.ContainsGradNode(next_node)) {
.IsGradNodeInVizGuard(next_node)) {
queue.push_back(next_node);
continue;
}
@@ -147,7 +147,7 @@ void ConstructForwardDebugDotGraph(const std::deque<GradNodeBase*>& init_queue,
} else {
if (need_dump_backward_subgraph &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance()
.ContainsGradNode(next_node)) {
.IsGradNodeInVizGuard(next_node)) {
dot->AddNode(dot_next_node_label,
paddle::inference::analysis::orange_box_attrs,
dot_next_node_label,
@@ -163,10 +163,10 @@ void ConstructForwardDebugDotGraph(const std::deque<GradNodeBase*>& init_queue,
// If need_dump_backward_subgraph is set, next_node is in the subgraph,
// and node is not, add node to the subgraph and draw the edge
if (need_dump_backward_subgraph &&
egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
next_node) &&
egr::EagerBackwardSubGraphNodeRecorder::Instance()
.IsGradNodeInVizGuard(next_node) &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance()
.ContainsGradNode(node)) {
.IsGradNodeInVizGuard(node)) {
dot_node_label = CreateNodeLabelInDot(node);
// The node is not in the subgraph but next_node is, so we mark it
// with orange_box as well
@@ -244,7 +244,9 @@ std::vector<paddle::Tensor> RunBackward(

// Control variables related to debugging
bool need_dump_backward_subgraph =
egr::EagerBackwardSubGraphNodeRecorder::Instance().HasCapturedSubgraph();
egr::EagerBackwardSubGraphNodeRecorder::Instance().NeedDumpBwdSubGraph();
bool need_backward_vlog_guard =
egr::EagerBackwardSubGraphNodeRecorder::Instance().NeedBwdVlogGuard();
bool need_debug_backward_graph =
!dump_backward_graph_path.empty() || need_dump_backward_subgraph;
//
@@ -458,6 +460,7 @@ std::vector<paddle::Tensor> RunBackward(
<< " Preparing ";
try {
queue.pop_front();
egr::LogLevelGuardBackward log_guard(need_backward_vlog_guard, node);

// Construct backward graph for debug
std::string dot_node_label = "";
@@ -588,9 +591,9 @@ std::vector<paddle::Tensor> RunBackward(
need_dump_backward_subgraph);
if (need_dump_grad_tensors &&
(egr::EagerBackwardSubGraphNodeRecorder::Instance()
.ContainsGradNode(node) ||
.IsGradNodeInVizGuard(node) ||
egr::EagerBackwardSubGraphNodeRecorder::Instance()
.ContainsGradNode(next_node))) {
.IsGradNodeInVizGuard(next_node))) {
debug_grad_tensors_str += egr::FormatTensor(grad_output_tensor);
}
}
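
The dot-graph construction above classifies every backward edge by whether its endpoints sit inside the captured viz subgraph: edges fully outside are skipped, boundary nodes are drawn with orange_box, and interior nodes keep the default style. A schematic predicate (plain Python, not Paddle code) summarizing those branches:

def classify_edge(node_in_viz, next_in_viz, need_dump_backward_subgraph):
    """Summarize the subgraph-filtering branches in ConstructForwardDebugDotGraph."""
    if not need_dump_backward_subgraph:
        return "draw"         # no filtering: every node and edge is drawn
    if not node_in_viz and not next_in_viz:
        return "skip"         # edge entirely outside the captured subgraph
    if node_in_viz != next_in_viz:
        return "draw-orange"  # boundary: the outside endpoint gets orange_box
    return "draw"             # both endpoints captured

assert classify_edge(False, False, True) == "skip"
assert classify_edge(True, False, True) == "draw-orange"
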
49 changes: 36 additions & 13 deletions paddle/fluid/eager/utils.cc
@@ -29,17 +29,17 @@
#include "paddle/fluid/eager/tensor_wrapper.h"

#include "paddle/common/layout.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_call_stack.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/common/logging_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/tensor_formatter.h"

#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_call_stack.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/variable.h"

#include "paddle/utils/md5.h"
COMMON_DECLARE_bool(enable_unique_name);
COMMON_DECLARE_int32(tensor_md5_checksum_precision);
@@ -1647,7 +1647,7 @@ std::string AddNodeToDebugBackwardGraph(Dot* dot,
// EagerBackwardSubGraphNodeRecorder. If we are capturing a subgraph,
// grad nodes unrelated to it will not be captured
if (need_dump_backward_subgraph &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
!egr::EagerBackwardSubGraphNodeRecorder::Instance().IsGradNodeInVizGuard(
node)) {
// no need to add node to dot graph
} else {
@@ -1669,9 +1669,9 @@ void AddEdgeToDebugBackwardGraph(Dot* dot,
bool need_dump_backward_subgraph) {
std::string dot_node_label = node_label;
if (need_dump_backward_subgraph &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
!egr::EagerBackwardSubGraphNodeRecorder::Instance().IsGradNodeInVizGuard(
node) &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
!egr::EagerBackwardSubGraphNodeRecorder::Instance().IsGradNodeInVizGuard(
next_node)) {
// if we are capturing a subgraph, grad nodes unrelated to it
// will not be captured
@@ -1685,8 +1685,8 @@ void AddEdgeToDebugBackwardGraph(Dot* dot,
false);
} else {
if (need_dump_backward_subgraph == false ||
egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
next_node)) {
egr::EagerBackwardSubGraphNodeRecorder::Instance()
.IsGradNodeInVizGuard(next_node)) {
dot->AddNode(dot_next_node_label,
paddle::inference::analysis::grey_box_attrs,
dot_next_node_label,
@@ -1704,10 +1704,10 @@ void AddEdgeToDebugBackwardGraph(Dot* dot,
// If need_dump_backward_subgraph is set, next_node is in the subgraph,
// and node is not, add node to the subgraph and draw the edge
if (need_dump_backward_subgraph &&
egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
egr::EagerBackwardSubGraphNodeRecorder::Instance().IsGradNodeInVizGuard(
next_node) &&
!egr::EagerBackwardSubGraphNodeRecorder::Instance().ContainsGradNode(
node)) {
!egr::EagerBackwardSubGraphNodeRecorder::Instance()
.IsGradNodeInVizGuard(node)) {
dot_node_label = CreateNodeLabelInDot(node);
// The node is not in the subgraph but next_node is, so we mark it
// with orange_box as well
@@ -1827,4 +1827,27 @@ void CheckGradNodeAccumulation(
}
}
}

LogLevelGuardBackward::LogLevelGuardBackward(bool need_backward_vlog_guard,
GradNodeBase* node) {
// Only elevate the log level for nodes captured by the vlog guard
if (need_backward_vlog_guard &&
egr::EagerBackwardSubGraphNodeRecorder::Instance().IsGradNodeInVlogGuard(
node)) {
saved_level_ = FLAGS_v;
SetVLOGLevel(egr::EagerBackwardSubGraphNodeRecorder::Instance()
.GetSubGraphBwdVlogLevel(node));
initialized_ = true;
}
}
void LogLevelGuardBackward::SetVLOGLevel(int level) {
FLAGS_v = level;
phi::set_phi_vlog_level(level);
}
LogLevelGuardBackward::~LogLevelGuardBackward() {
if (PD_UNLIKELY(initialized_)) {
// We should restore the log level
SetVLOGLevel(saved_level_);
}
}
} // namespace egr
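
LogLevelGuardBackward is a straightforward RAII guard: it saves FLAGS_v, raises the vlog level to the one recorded for the node, and restores the saved level on destruction, even if the node's backward throws. A hedged Python analogue of the same save/elevate/restore pattern (illustrative names, not Paddle API):

from contextlib import contextmanager

FLAGS_V = 0  # stand-in for the global FLAGS_v verbosity flag

@contextmanager
def log_level_guard_backward(need_guard, node, node_to_vlog_level):
    """Elevate verbosity only while a vlog-captured node executes."""
    global FLAGS_V
    if need_guard and node in node_to_vlog_level:
        saved = FLAGS_V
        FLAGS_V = node_to_vlog_level[node]
        try:
            yield
        finally:
            FLAGS_V = saved  # always restore, mirroring the destructor
    else:
        yield

# Schematic backward loop: only matmul_grad runs at elevated verbosity.
for node in ["matmul_grad", "relu_grad"]:
    with log_level_guard_backward(True, node, {"matmul_grad": 4}):
        pass  # run the node's backward here
assert FLAGS_V == 0
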
13 changes: 13 additions & 0 deletions paddle/fluid/eager/utils.h
@@ -505,4 +505,17 @@ void CheckGradNodeAccumulation(
void CheckGradNodeAccumulation(const std::vector<paddle::Tensor>& tensors);
void CheckGradNodeAccumulation(
const std::vector<std::vector<paddle::Tensor*>>& tensors);

class LogLevelGuardBackward {
public:
explicit LogLevelGuardBackward(bool need_backward_vlog_guard,
GradNodeBase* node);
LogLevelGuardBackward() = delete;
~LogLevelGuardBackward();

private:
void SetVLOGLevel(int level);
bool initialized_ = false;
int saved_level_ = 0;
};
} // namespace egr
71 changes: 47 additions & 24 deletions paddle/fluid/pybind/eager_functions.cc
@@ -1520,37 +1520,56 @@ PyObject* eager__for_test_check_cuda_error(PyObject* self,

EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyObject* eager__start_capture_debug_backward_subgraph(PyObject* self,
PyObject* args,
PyObject* kwargs) {
PyObject* eager__start_capture_backward_viz_subgraph(PyObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY

egr::EagerBackwardSubGraphNodeRecorder::Instance().StartCaptureSubGraph();
std::string dump_dir_path =
CastPyArg2AttrString(PyTuple_GET_ITEM(args, 0), 0);
bool need_dump_grad_tensors =
CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
egr::EagerBackwardSubGraphNodeRecorder::Instance().SetDumpDirPath(
dump_dir_path);
egr::EagerBackwardSubGraphNodeRecorder::Instance().SetNeedDumpGradTensors(
need_dump_grad_tensors);
egr::EagerBackwardSubGraphNodeRecorder::Instance()
.StartCaptureSubGraphForViz();
RETURN_PY_NONE

EAGER_CATCH_AND_THROW_RETURN_NULL
}

PyObject* eager__end_capture_debug_backward_subgraph(PyObject* self,
PyObject* eager__stop_capture_backward_viz_subgraph(PyObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY

egr::EagerBackwardSubGraphNodeRecorder::Instance()
.StopCaptureSubGraphForViz();
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyObject* eager__stop_capture_backward_vlog_subgraph(PyObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
egr::EagerBackwardSubGraphNodeRecorder::Instance().EndCaptureSubGraph();
egr::EagerBackwardSubGraphNodeRecorder::Instance()
.StopCaptureSubGraphForVlog();
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyObject* eager__init_backward_subgraph_recorder(PyObject* self,
PyObject* args,
PyObject* kwargs) {

PyObject* eager__start_capture_backward_vlog_subgraph(PyObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
std::string dump_dir_path =
CastPyArg2AttrString(PyTuple_GET_ITEM(args, 0), 0);
bool need_dump_grad_tensors =
CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
egr::EagerBackwardSubGraphNodeRecorder::Instance().SetDumpDirPath(
dump_dir_path);
egr::EagerBackwardSubGraphNodeRecorder::Instance().SetNeedDumpGradTensors(
need_dump_grad_tensors);

int subgraph_vlog_level = CastPyArg2AttrInt(PyTuple_GET_ITEM(args, 0), 0);

egr::EagerBackwardSubGraphNodeRecorder::Instance().SetSubGraphBwdVlogLevel(
subgraph_vlog_level);
egr::EagerBackwardSubGraphNodeRecorder::Instance()
.StartCaptureSubGraphForVlog();
RETURN_PY_NONE

EAGER_CATCH_AND_THROW_RETURN_NULL
@@ -1642,16 +1661,20 @@ PyMethodDef variable_functions[] = { // NOLINT
(PyCFunction)(void (*)())eager__add_doc_str,
METH_VARARGS,
nullptr},
{"_start_capture_debug_backward_subgraph",
(PyCFunction)(void (*)())eager__start_capture_debug_backward_subgraph,
{"_start_capture_backward_viz_subgraph",
(PyCFunction)(void (*)())eager__start_capture_backward_viz_subgraph,
METH_VARARGS | METH_KEYWORDS,
nullptr},
{"_end_capture_debug_backward_subgraph",
(PyCFunction)(void (*)())eager__end_capture_debug_backward_subgraph,
{"_stop_capture_backward_viz_subgraph",
(PyCFunction)(void (*)())eager__stop_capture_backward_viz_subgraph,
METH_VARARGS,
nullptr},
{"_init_backward_subgraph_recorder",
(PyCFunction)(void (*)())eager__init_backward_subgraph_recorder,
{"_start_capture_backward_vlog_subgraph",
(PyCFunction)(void (*)())eager__start_capture_backward_vlog_subgraph,
METH_VARARGS | METH_KEYWORDS,
nullptr},
{"_stop_capture_backward_vlog_subgraph",
(PyCFunction)(void (*)())eager__stop_capture_backward_vlog_subgraph,
METH_VARARGS,
nullptr},
/**sparse functions**/
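
As registered above, the viz start binding takes positional (dump_dir_path, need_dump_grad_tensors) arguments and the vlog start binding takes a single level; the stop bindings take none. A hedged sketch of driving them directly (paths and levels are illustrative; the context managers in framework.py below are the intended entry points):

import paddle

eager = paddle.base.core.eager

# Viz window: grad nodes created here feed the backward dot-graph dump.
eager._start_capture_backward_viz_subgraph("/tmp/bwd_viz", True)
# ... forward ops whose backward subgraph should be visualized ...
eager._stop_capture_backward_viz_subgraph()

# Vlog window: grad nodes created here run backward at VLOG level 4.
eager._start_capture_backward_vlog_subgraph(4)
# ... forward ops whose backward logging should be elevated ...
eager._stop_capture_backward_vlog_subgraph()
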
15 changes: 12 additions & 3 deletions python/paddle/base/framework.py
@@ -8634,14 +8634,23 @@ def capture_backward_subgraph_guard(
assert dump_dir_path is not None, "The dump_dir_path should not be None"
# for multi process
check_and_create_dir(dump_dir_path)
paddle.base.core.eager._init_backward_subgraph_recorder(
paddle.base.core.eager._start_capture_backward_viz_subgraph(
dump_dir_path, need_dump_grad_tensors
)
paddle.base.core.eager._start_capture_debug_backward_subgraph()
try:
yield
finally:
paddle.base.core.eager._end_capture_debug_backward_subgraph()
paddle.base.core.eager._stop_capture_backward_viz_subgraph()


@signature_safe_contextmanager
def backward_vlog_guard(level: int) -> Generator[None, None, None]:
assert isinstance(level, int), "vlog level must be an int"
paddle.base.core.eager._start_capture_backward_vlog_subgraph(level)
try:
yield
finally:
paddle.base.core.eager._stop_capture_backward_vlog_subgraph()
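
Putting the two guards together, an end-to-end sketch (the model, tensor shapes, and dump path are illustrative; argument names follow the bodies above):

import paddle
from paddle.base.framework import (
    backward_vlog_guard,
    capture_backward_subgraph_guard,
)

linear = paddle.nn.Linear(8, 2)

# Ops built inside the guard contribute grad nodes to the viz subgraph;
# backward() then dumps the dot graph (and grad tensors) under the path.
with capture_backward_subgraph_guard("/tmp/bwd_viz", True):
    y = linear(paddle.randn([4, 8]))
y.mean().backward()

# Elevate backward VLOG output to level 4, but only for z's grad nodes.
with backward_vlog_guard(4):
    z = linear(paddle.randn([4, 8]))
z.sum().backward()
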

