Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Algos: make findSimilarStr constexpr #1129

Merged
merged 1 commit into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 32 additions & 119 deletions src/Algos.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include "Logger.hpp"
#include "Rustify/Result.hpp"

#include <algorithm>
#include <cctype>
#include <chrono>
#include <cstdlib>
Expand All @@ -13,7 +12,6 @@
#include <string_view>
#include <thread>
#include <utility>
#include <vector>

namespace cabin {

Expand Down Expand Up @@ -103,91 +101,6 @@ commandExists(const std::string_view cmd) noexcept {
.unwrap_or(false);
}

// ref: https://wandbox.org/permlink/zRjT41alOHdwcf00
static std::size_t
levDistance(const std::string_view lhs, const std::string_view rhs) noexcept {
const std::size_t lhsSize = lhs.size();
const std::size_t rhsSize = rhs.size();

// for all i and j, d[i,j] will hold the Levenshtein distance between the
// first i characters of s and the first j characters of t
std::vector<std::vector<std::size_t>> dist(
lhsSize + 1, std::vector<std::size_t>(rhsSize + 1)
);
dist[0][0] = 0;

// source prefixes can be transformed into empty string by dropping all
// characters
for (std::size_t i = 1; i <= lhsSize; ++i) {
dist[i][0] = i;
}

// target prefixes can be reached from empty source prefix by inserting every
// character
for (std::size_t j = 1; j <= rhsSize; ++j) {
dist[0][j] = j;
}

for (std::size_t i = 1; i <= lhsSize; ++i) {
for (std::size_t j = 1; j <= rhsSize; ++j) {
const std::size_t substCost = lhs[i - 1] == rhs[j - 1] ? 0 : 1;
dist[i][j] = std::min({
dist[i - 1][j] + 1, // deletion
dist[i][j - 1] + 1, // insertion
dist[i - 1][j - 1] + substCost // substitution
});
}
}

return dist[lhsSize][rhsSize];
}

static bool
equalsInsensitive(
const std::string_view lhs, const std::string_view rhs
) noexcept {
return std::ranges::equal(lhs, rhs, [](char lhs, char rhs) {
return std::tolower(lhs) == std::tolower(rhs);
});
}

std::optional<std::string_view>
findSimilarStr(
const std::string_view lhs, std::span<const std::string_view> candidates
) noexcept {
// We need to check if `Candidates` has the exact case-insensitive string
// because the Levenshtein distance match does not care about it.
for (const std::string_view str : candidates) {
if (equalsInsensitive(lhs, str)) {
return str;
}
}

// Keep going with the Levenshtein distance match.
// If the LHS size is less than 3, use the LHS size minus 1 and if not,
// use the LHS size divided by 3.
const std::size_t length = lhs.size();
const std::size_t maxDist = length < 3 ? length - 1 : length / 3;

std::optional<std::pair<std::string_view, std::size_t>> similarStr =
std::nullopt;
for (const std::string_view str : candidates) {
const std::size_t curDist = levDistance(lhs, str);
if (curDist <= maxDist) {
// The first similar string found || More similar string found
if (!similarStr.has_value() || curDist < similarStr->second) {
similarStr = { str, curDist };
}
}
}

if (similarStr.has_value()) {
return similarStr->first;
} else {
return std::nullopt;
}
}

} // namespace cabin

#ifdef CABIN_TEST
Expand Down Expand Up @@ -219,22 +132,22 @@ testLevDistance2() {
constexpr std::string_view str2 = "\nMary häd ä little lämb\n\nLittle lämb\n";
constexpr std::string_view str3 = "Mary häd ä little lämb\n\nLittle lämb\n";

assertEq(levDistance(str1, str2), 2UL);
assertEq(levDistance(str2, str1), 2UL);
assertEq(levDistance(str1, str3), 3UL);
assertEq(levDistance(str3, str1), 3UL);
assertEq(levDistance(str2, str3), 1UL);
assertEq(levDistance(str3, str2), 1UL);
static_assert(levDistance(str1, str2) == 2UL);
static_assert(levDistance(str2, str1) == 2UL);
static_assert(levDistance(str1, str3) == 3UL);
static_assert(levDistance(str3, str1) == 3UL);
static_assert(levDistance(str2, str3) == 1UL);
static_assert(levDistance(str3, str2) == 1UL);

assertEq(levDistance("b", "bc"), 1UL);
assertEq(levDistance("ab", "abc"), 1UL);
assertEq(levDistance("aab", "aabc"), 1UL);
assertEq(levDistance("aaab", "aaabc"), 1UL);
static_assert(levDistance("b", "bc") == 1UL);
static_assert(levDistance("ab", "abc") == 1UL);
static_assert(levDistance("aab", "aabc") == 1UL);
static_assert(levDistance("aaab", "aaabc") == 1UL);

assertEq(levDistance("a", "b"), 1UL);
assertEq(levDistance("ab", "ac"), 1UL);
assertEq(levDistance("aab", "aac"), 1UL);
assertEq(levDistance("aaab", "aaac"), 1UL);
static_assert(levDistance("a", "b") == 1UL);
static_assert(levDistance("ab", "ac") == 1UL);
static_assert(levDistance("aab", "aac") == 1UL);
static_assert(levDistance("aaab", "aaac") == 1UL);

pass();
}
Expand All @@ -248,21 +161,21 @@ testFindSimilarStr() {
"if", "ifdef", "ifndef", "elif", "else", "endif", "elifdef", "elifndef"
};

assertEq(findSimilarStr("id", candidates).value(), "if"sv);
assertEq(findSimilarStr("ifd", candidates).value(), "if"sv);
assertEq(findSimilarStr("ifde", candidates).value(), "ifdef"sv);
assertEq(findSimilarStr("elf", candidates).value(), "elif"sv);
assertEq(findSimilarStr("elsif", candidates).value(), "elif"sv);
assertEq(findSimilarStr("elseif", candidates).value(), "elif"sv);
assertEq(findSimilarStr("elfidef", candidates).value(), "elifdef"sv);
assertEq(findSimilarStr("elfindef", candidates).value(), "elifdef"sv);
assertEq(findSimilarStr("elfinndef", candidates).value(), "elifndef"sv);
assertEq(findSimilarStr("els", candidates).value(), "else"sv);
assertEq(findSimilarStr("endi", candidates).value(), "endif"sv);

assertFalse(findSimilarStr("i", candidates).has_value());
assertFalse(
findSimilarStr("special_compiler_directive", candidates).has_value()
static_assert(findSimilarStr("id", candidates) == "if"sv);
static_assert(findSimilarStr("ifd", candidates) == "if"sv);
static_assert(findSimilarStr("ifde", candidates) == "ifdef"sv);
static_assert(findSimilarStr("elf", candidates) == "elif"sv);
static_assert(findSimilarStr("elsif", candidates) == "elif"sv);
static_assert(findSimilarStr("elseif", candidates) == "elif"sv);
static_assert(findSimilarStr("elfidef", candidates) == "elifdef"sv);
static_assert(findSimilarStr("elfindef", candidates) == "elifdef"sv);
static_assert(findSimilarStr("elfinndef", candidates) == "elifndef"sv);
static_assert(findSimilarStr("els", candidates) == "else"sv);
static_assert(findSimilarStr("endi", candidates) == "endif"sv);

static_assert(!findSimilarStr("i", candidates).has_value());
static_assert(
!findSimilarStr("special_compiler_directive", candidates).has_value()
);

pass();
Expand All @@ -271,11 +184,11 @@ testFindSimilarStr() {
static void
testFindSimilarStr2() {
constexpr std::array<std::string_view, 2> candidates{ "aaab", "aaabc" };
assertEq(findSimilarStr("aaaa", candidates).value(), "aaab"sv);
assertFalse(findSimilarStr("1111111111", candidates).has_value());
static_assert(findSimilarStr("aaaa", candidates) == "aaab"sv);
static_assert(!findSimilarStr("1111111111", candidates).has_value());

constexpr std::array<std::string_view, 1> candidates2{ "AAAA" };
assertEq(findSimilarStr("aaaa", candidates2).value(), "AAAA"sv);
static_assert(findSimilarStr("aaaa", candidates2) == "AAAA"sv);

pass();
}
Expand Down
92 changes: 90 additions & 2 deletions src/Algos.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
#include "Command.hpp"
#include "Rustify/Result.hpp"

#include <algorithm>
#include <cstddef>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <vector>

namespace cabin {

Expand All @@ -22,6 +24,59 @@ Result<std::string>
getCmdOutput(const Command& cmd, std::size_t retry = 3) noexcept;
bool commandExists(std::string_view cmd) noexcept;

// ref: https://wandbox.org/permlink/zRjT41alOHdwcf00
constexpr std::size_t
levDistance(const std::string_view lhs, const std::string_view rhs) noexcept {
const std::size_t lhsSize = lhs.size();
const std::size_t rhsSize = rhs.size();

// for all i and j, d[i,j] will hold the Levenshtein distance between the
// first i characters of s and the first j characters of t
std::vector<std::vector<std::size_t>> dist(
lhsSize + 1, std::vector<std::size_t>(rhsSize + 1)
);
dist[0][0] = 0;

// source prefixes can be transformed into empty string by dropping all
// characters
for (std::size_t i = 1; i <= lhsSize; ++i) {
dist[i][0] = i;
}

// target prefixes can be reached from empty source prefix by inserting every
// character
for (std::size_t j = 1; j <= rhsSize; ++j) {
dist[0][j] = j;
}

for (std::size_t i = 1; i <= lhsSize; ++i) {
for (std::size_t j = 1; j <= rhsSize; ++j) {
const std::size_t substCost = lhs[i - 1] == rhs[j - 1] ? 0 : 1;
dist[i][j] = std::min({
dist[i - 1][j] + 1, // deletion
dist[i][j - 1] + 1, // insertion
dist[i - 1][j - 1] + substCost // substitution
});
}
}

return dist[lhsSize][rhsSize];
}

constexpr char
toLower(char c) noexcept {
return (c >= 'A' && c <= 'Z') ? static_cast<char>(c + ('a' - 'A')) : c;
}

constexpr bool
equalsInsensitive(
const std::string_view lhs, const std::string_view rhs
) noexcept {
return std::ranges::equal(lhs, rhs, [](char lhs, char rhs) {
return toLower(lhs) == toLower(rhs);
});
}

// ref: https://reviews.llvm.org/differential/changeset/?ref=3315514
/// Find a similar string in `candidates`.
///
Expand All @@ -31,8 +86,41 @@ bool commandExists(std::string_view cmd) noexcept;
///
/// \returns a similar string if exists. If no similar string exists,
/// returns std::nullopt.
std::optional<std::string_view> findSimilarStr(
constexpr std::optional<std::string_view>
findSimilarStr(
std::string_view lhs, std::span<const std::string_view> candidates
) noexcept;
) noexcept {
// We need to check if `Candidates` has the exact case-insensitive string
// because the Levenshtein distance match does not care about it.
for (const std::string_view str : candidates) {
if (equalsInsensitive(lhs, str)) {
return str;
}
}

// Keep going with the Levenshtein distance match.
// If the LHS size is less than 3, use the LHS size minus 1 and if not,
// use the LHS size divided by 3.
const std::size_t length = lhs.size();
const std::size_t maxDist = length < 3 ? length - 1 : length / 3;

std::optional<std::pair<std::string_view, std::size_t>> similarStr =
std::nullopt;
for (const std::string_view str : candidates) {
const std::size_t curDist = levDistance(lhs, str);
if (curDist <= maxDist) {
// The first similar string found || More similar string found
if (!similarStr.has_value() || curDist < similarStr->second) {
similarStr = { str, curDist };
}
}
}

if (similarStr.has_value()) {
return similarStr->first;
} else {
return std::nullopt;
}
}

} // namespace cabin
Loading