Skip to content

[SYCLomatic] Fix the migration of cudaMemcpy in template function #2650

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 109 additions & 89 deletions clang/lib/DPCT/Utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3512,6 +3512,18 @@ void findRelatedDREOffsets(std::set<const clang::DeclRefExpr *> &DRESet,
std::sort(DREOffsetVec.begin(), DREOffsetVec.end());
}

std::string getDirectCalleeName(const CallExpr *CE) {
std::string FuncName = "";
if (CE->getDirectCallee()) {
FuncName = CE->getDirectCallee()->getNameInfo().getName().getAsString();
} else {
if (auto ULE = dyn_cast_or_null<UnresolvedLookupExpr>(CE->getCallee())) {
FuncName = ULE->getNameInfo().getAsString();
}
}
return FuncName;
}

bool analyzeMemcpyOrder(
const clang::CompoundStmt *CS,
std::vector<std::pair<const Stmt *, MemcpyOrderAnalysisNodeKind>>
Expand All @@ -3536,14 +3548,7 @@ bool analyzeMemcpyOrder(
const CallExpr *CE = Result.getNodeAs<CallExpr>("CallExpr");
if (!CE)
return false;
std::string FuncName = "";
if (CE->getDirectCallee()) {
FuncName = CE->getDirectCallee()->getNameInfo().getName().getAsString();
} else {
if (auto ULE = dyn_cast_or_null<UnresolvedLookupExpr>(CE->getCallee())) {
FuncName = ULE->getNameInfo().getAsString();
}
}
std::string FuncName = getDirectCalleeName(CE);
if (FuncName.empty())
return false;

Expand All @@ -3557,12 +3562,14 @@ bool analyzeMemcpyOrder(
CE, MemcpyOrderAnalysisNodeKind::MOANK_MemcpyInFlowControl);
} else {
// Record the first and second argument of memcpy
int DirectionArgIndex = 4;
unsigned int DirectionArgIndex = 4;
if (FuncName == "cudaMemcpy") {
DirectionArgIndex = 3;
}
if (auto Direction =
dyn_cast<DeclRefExpr>(CE->getArg(DirectionArgIndex))) {
const DeclRefExpr *Direction = nullptr;
if ((CE->getNumArgs() > DirectionArgIndex) &&
(Direction =
dyn_cast<DeclRefExpr>(CE->getArg(DirectionArgIndex)))) {
auto CpyKind = Direction->getDecl()->getName();
if (CpyKind == "cudaMemcpyDeviceToHost" ||
CpyKind == "cudaMemcpyHostToHost") {
Expand Down Expand Up @@ -3770,94 +3777,107 @@ bool canOmitMemcpyWait(const clang::CallExpr *CE) {
// ...
// cudaMemcpy(device_dst, &data, sizeof(int), cudaMemcpyHostToDevice);
// }
if (auto Direction = dyn_cast<DeclRefExpr>(CE->getArg(3))) {
auto CpyKind = Direction->getDecl()->getName();
if (CpyKind == "cudaMemcpyDeviceToDevice") {
return true;
}
if (CpyKind == "cudaMemcpyHostToDevice" &&
dpct::DpctGlobalInfo::isOptimizeMigration()) {
if (auto Body = getBodyofAncestorFCStmt(CE)) {
if (dpct::DpctGlobalInfo::isAncestor(Body, CE)) {
return false;
}
std::string FuncName = getDirectCalleeName(CE);
if (FuncName.empty())
return false;
unsigned int DirectionArgIndex = 4;
if (FuncName == "cudaMemcpy") {
DirectionArgIndex = 3;
}
std::string CpyKind;
if (CE->getNumArgs() > DirectionArgIndex) {
if (const DeclRefExpr *Direction =
dyn_cast<DeclRefExpr>(CE->getArg(DirectionArgIndex)))
CpyKind = Direction->getDecl()->getName().str();
} else if (FuncName == "cudaMemcpyFromSymbol") {
CpyKind = "cudaMemcpyDeviceToHost ";
} else if (FuncName == "cudaMemcpyToSymbol") {
CpyKind = "cudaMemcpyHostToDevice";
}

if (CpyKind == "cudaMemcpyDeviceToDevice") {
return true;
}
if (CpyKind == "cudaMemcpyHostToDevice" &&
dpct::DpctGlobalInfo::isOptimizeMigration()) {
if (auto Body = getBodyofAncestorFCStmt(CE)) {
if (dpct::DpctGlobalInfo::isAncestor(Body, CE)) {
return false;
}
auto SrcExpr = CE->getArg(1);
auto AddrOfMatcher =
clang::ast_matchers::findAll(clang::ast_matchers::unaryOperator(
clang::ast_matchers::hasOperatorName("&")));
auto AddrOfMatchedResults = clang::ast_matchers::match(
AddrOfMatcher, *SrcExpr, dpct::DpctGlobalInfo::getContext());
if (AddrOfMatchedResults.size() == 0) {
auto SyncPointMatcher = clang::ast_matchers::findAll(
clang::ast_matchers::callExpr(
clang::ast_matchers::callee(clang::ast_matchers::functionDecl(
clang::ast_matchers::hasAnyName("cudaDeviceSynchronize"))))
.bind("SyncPoint"));
auto CEBegLocInfo = dpct::DpctGlobalInfo::getLocInfo(CE->getBeginLoc());
auto CEEndLocInfo = dpct::DpctGlobalInfo::getLocInfo(CE->getEndLoc());
std::set<const clang::DeclRefExpr *> DRESet;
bool HasCallExpr = false;
bool isSrcPointerFreedAfterCE = false;
std::vector<const clang::DeclRefExpr *> DREMatchResult;
std::set<const clang::DeclRefExpr *> SrcDRESet;
std::set<unsigned int> SyncPointOffset;
auto SyncPointMatchedResults = clang::ast_matchers::match(
SyncPointMatcher, *CS, dpct::DpctGlobalInfo::getContext());
for (auto &SP : SyncPointMatchedResults) {
if (const CallExpr *SPCE = SP.getNodeAs<CallExpr>("SyncPoint")) {
if (auto Body = getBodyofAncestorFCStmt(SPCE)) {
if (dpct::DpctGlobalInfo::isAncestor(Body, SPCE)) {
continue;
}
}
auto SrcExpr = CE->getArg(1);
auto AddrOfMatcher =
clang::ast_matchers::findAll(clang::ast_matchers::unaryOperator(
clang::ast_matchers::hasOperatorName("&")));
auto AddrOfMatchedResults = clang::ast_matchers::match(
AddrOfMatcher, *SrcExpr, dpct::DpctGlobalInfo::getContext());
if (AddrOfMatchedResults.size() == 0) {
auto SyncPointMatcher = clang::ast_matchers::findAll(
clang::ast_matchers::callExpr(
clang::ast_matchers::callee(clang::ast_matchers::functionDecl(
clang::ast_matchers::hasAnyName("cudaDeviceSynchronize"))))
.bind("SyncPoint"));
auto CEBegLocInfo = dpct::DpctGlobalInfo::getLocInfo(CE->getBeginLoc());
auto CEEndLocInfo = dpct::DpctGlobalInfo::getLocInfo(CE->getEndLoc());
std::set<const clang::DeclRefExpr *> DRESet;
bool HasCallExpr = false;
bool isSrcPointerFreedAfterCE = false;
std::vector<const clang::DeclRefExpr *> DREMatchResult;
std::set<const clang::DeclRefExpr *> SrcDRESet;
std::set<unsigned int> SyncPointOffset;
auto SyncPointMatchedResults = clang::ast_matchers::match(
SyncPointMatcher, *CS, dpct::DpctGlobalInfo::getContext());
for (auto &SP : SyncPointMatchedResults) {
if (const CallExpr *SPCE = SP.getNodeAs<CallExpr>("SyncPoint")) {
if (auto Body = getBodyofAncestorFCStmt(SPCE)) {
if (dpct::DpctGlobalInfo::isAncestor(Body, SPCE)) {
continue;
}
SyncPointOffset.insert(
dpct::DpctGlobalInfo::getLocInfo(SPCE->getBeginLoc()).second);
}
SyncPointOffset.insert(
dpct::DpctGlobalInfo::getLocInfo(SPCE->getBeginLoc()).second);
}
auto checkIfSrcPointerFreedAfterCE = [&]() {
for (auto &D : DREMatchResult) {
if (auto ParentCE =
dpct::DpctGlobalInfo::findAncestor<CallExpr>(D)) {
auto DC = ParentCE->getDirectCallee();
if (!DC) {
continue;
}
std::string FuncName = getFunctionName(DC);
auto DRELocInfo =
dpct::DpctGlobalInfo::getLocInfo(D->getEndLoc());
if ((FuncName == "free" || FuncName == "cudaFreeHost") &&
(DRELocInfo.second > CEEndLocInfo.second)) {
bool FreeAfterSyncPoint = false;
for (auto &Offset : SyncPointOffset) {
if ((Offset > CEEndLocInfo.second) &&
(Offset < DRELocInfo.second)) {
FreeAfterSyncPoint = true;
break;
}
}
if (!FreeAfterSyncPoint) {
return true;
}
auto checkIfSrcPointerFreedAfterCE = [&]() {
for (auto &D : DREMatchResult) {
if (auto ParentCE = dpct::DpctGlobalInfo::findAncestor<CallExpr>(D)) {
auto DC = ParentCE->getDirectCallee();
if (!DC) {
continue;
}
std::string FuncName = getFunctionName(DC);
auto DRELocInfo = dpct::DpctGlobalInfo::getLocInfo(D->getEndLoc());
if ((FuncName == "free" || FuncName == "cudaFreeHost") &&
(DRELocInfo.second > CEEndLocInfo.second)) {
bool FreeAfterSyncPoint = false;
for (auto &Offset : SyncPointOffset) {
if ((Offset > CEEndLocInfo.second) &&
(Offset < DRELocInfo.second)) {
FreeAfterSyncPoint = true;
break;
}
}
if (!FreeAfterSyncPoint) {
return true;
}
}
}
return false;
};
findDREs(SrcExpr, SrcDRESet, HasCallExpr);
for (auto &SrcDRE : SrcDRESet) {
findAllVarRef(SrcDRE, DREMatchResult);
if (isSrcPointerFreedAfterCE = checkIfSrcPointerFreedAfterCE()) {
break;
}
DREMatchResult.clear();
}
if (!isSrcPointerFreedAfterCE) {
dpct::DiagnosticsUtils::report(
CEBegLocInfo.first, CEBegLocInfo.second,
dpct::Diagnostics::WAIT_REMOVE, true, false);
return true;
return false;
};
findDREs(SrcExpr, SrcDRESet, HasCallExpr);
for (auto &SrcDRE : SrcDRESet) {
findAllVarRef(SrcDRE, DREMatchResult);
if (isSrcPointerFreedAfterCE = checkIfSrcPointerFreedAfterCE()) {
break;
}
DREMatchResult.clear();
}
if (!isSrcPointerFreedAfterCE) {
dpct::DiagnosticsUtils::report(CEBegLocInfo.first, CEBegLocInfo.second,
dpct::Diagnostics::WAIT_REMOVE, true,
false);
return true;
}
}
}
Expand Down
26 changes: 26 additions & 0 deletions clang/test/dpct/USM-restricted.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1095,3 +1095,29 @@ int foo17() {
cudaMemcpy(d_in_data_2, h_data, mem_size, cudaMemcpyHostToDevice);
return 0;
}

#define MAX_MASK_WIDTH 10

template <typename T>
__constant__ T mask[MAX_MASK_WIDTH];

template <typename T>
void foo18() {
size_t size_bytes = sizeof(double);
double *a = (T *)malloc(size_bytes);
double *d_a;
cudaMalloc((void **)&d_a, size_bytes);
// CHECK: q_ct1.memcpy(d_a, a, size_bytes);
// CHECK-NEXT: T h_mask[MAX_MASK_WIDTH];
// CHECK-NEXT: q_ct1.memcpy(mask.get_ptr(), h_mask, sizeof(double)).wait();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Output/USM-restricted/USM-restricted.dp.cpp:1351:16: error: use of variable template 'mask' requires template arguments
1351 | q_ct1.memcpy(mask.get_ptr(), h_mask, sizeof(double)).wait();
| ^
Output/USM-restricted/USM-restricted.dp.cpp:1336:36: note: template is declared here
1335 | template
| ~~~~~~~~~~~~~~~~~~~~~
1336 | static dpct::constant_memory<T, 1> mask(MAX_MASK_WIDTH);
| ^

cudaMemcpy(d_a, a, size_bytes, cudaMemcpyHostToDevice);
T h_mask[MAX_MASK_WIDTH];
cudaMemcpyToSymbol(mask<double>, h_mask, sizeof(double));
}

int foo19() {
foo18<double>();
return 0;
}

#undef MAX_MASK_WIDTH