Skip to content

Commit

Permalink
linear_fingerprint: ring bits
Browse files Browse the repository at this point in the history
  • Loading branch information
IanAWatson committed Nov 21, 2024
1 parent f5f7ed9 commit 262fa14
Show file tree
Hide file tree
Showing 13 changed files with 370 additions and 86 deletions.
1 change: 1 addition & 0 deletions contrib/bin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
!lib
7 changes: 7 additions & 0 deletions docs/Molecule_Tools/grep_molecule.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,13 @@ Worst case would be finding a set of molecules in itself, which would require
the full unique smiles computation. Nevertheless, doing this on 20k random
molecules takes just 2.7 seconds.

## Options
To remove chirality from both the needle molecule and the molecules being
searched, add the -c option.

Molecules can be reduced to the largest fragment via the -l option. Again,
the transformation is applied to both the needle and the haystack molecules.

## Further Optimisation
The number of chiral centres could be included with the aromatic molecular formula,
or perhaps discerned from the starting smiles. If chirality is being considered
Expand Down
3 changes: 1 addition & 2 deletions docs/Molecule_Tools/iwdescr.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ see various missing columns.
## Descriptors.
The following descriptors are computed.

| ---- | ---------- |
| name | definition |
| ---- | ---------- |
| natoms | the number of atoms in the molecule |
Expand Down Expand Up @@ -323,7 +322,7 @@ off all optional descriptors. This can make a big difference in run times. Runni
```
iwdescr.sh -O all file.smi > file.w
```
takes 5.7 seconds to process 20k molecules, generating 264 columns of ouput. Running
takes 5.7 seconds to process 20k molecules, generating 277 columns of output. Running
```
iwdescr.sh -O none file.smi > file.w
```
Expand Down
24 changes: 24 additions & 0 deletions src/Foundational/iwmisc/proto_for_testing.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
syntax = "proto3";

package for_testing;

// Small message that is embedded in TestMessage via its `sub_message` field.
message SubMessage {
optional int32 i1 = 1;
optional string str1 = 2;
}

// Message in the `for_testing` package that contains optional scalar fields,
// repeated fields and a nested message.
message TestMessage {
// Optional string fields.
optional string str1 = 1;
optional string str2 = 2;

// Optional numeric fields: signed, unsigned and floating point.
optional int32 i1 = 3;
optional uint32 ui1 = 4;
optional float x = 5;

// Repeated numeric fields.
repeated int32 int_array = 6;
repeated float float_array = 7;

// Repeated string field.
repeated string repeated_string = 8;

// Nested message field.
optional SubMessage sub_message = 9;
}
13 changes: 13 additions & 0 deletions src/Foundational/iwstring/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cc_library(
"remove_suffix.cc",
"string_change.cc",
"string_relationals.cc",
"tokenise_with_quotes.cc",
"unhtml.cc",
],
hdrs = [
Expand Down Expand Up @@ -88,3 +89,15 @@ cc_test(
],
timeout="short",
)

# Unit tests in tokenise_with_quotes_test.cc, built against :iwstring and GoogleTest.
cc_test(
name = "tokenise_with_quotes_test",
srcs = [
"tokenise_with_quotes_test.cc",
],
deps = [
":iwstring",
"@googletest//:gtest",
"@googletest//:gtest_main",
],
)
18 changes: 18 additions & 0 deletions src/Foundational/iwstring/iwstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,24 @@ Equals(const const_IWSubstring& lhs, const std::string_view& rhs) {
return 0 == ::strncmp(lhs.data(), rhs.data(), lhs.length());
}

// Used for reading records from tabular files where there might
// be quoted tokens.
// For each token encountered, add to `tstart` and `tstop` the
// start and stop for that token - excluding quotes.
// Both arrays are emptied on entry. On success they hold one entry per
// token, and `tstop[i]` is one past the last character of token `i`, so
// `tstop[i] - tstart[i]` is the length of the token.
// Returns the number of tokens found, which is zero if `buffer` is empty.
// Returns a negative number on failure, which happens if `buffer` begins
// with `sep` or if a quoted token is never closed.
// The output can be processed with something like:
//
// int ntokens = TokeniseWithQuotes(buffer, ',', _tstart, _tstop);
// if (ntokens < 0) .... fail.
// for (int i = 0; i < ntokens; ++i) {
// int b = _tstart[i];
// int e = _tstop[i];
// const_IWSubstring token(buffer.rawdata() + b, e - b);
// }
int
TokeniseWithQuotes(const const_IWSubstring& buffer,
char sep,
resizable_array<int>& tstart,
resizable_array<int>& tstop);

} // namespace iwstring

inline std::ostream &
Expand Down
105 changes: 105 additions & 0 deletions src/Foundational/iwstring/tokenise_with_quotes.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#include <iostream>

#include "Foundational/iwstring/iwstring.h"

namespace iwstring {

using std::cerr;

constexpr int kInvalid = -1;

// Split `buffer` into tokens separated by `sep`. A token may be enclosed in
// double quotes, in which case it may itself contain `sep`.
//
// `tstart` and `tstop` are emptied, then receive one entry per token:
// `tstart[i]` is the offset of the first character of token `i` and
// `tstop[i]` is one past its last character. Enclosing quotes are excluded.
//
// A quote only opens a quoted token when it is the first character of the
// token, and only closes it when it is followed by `sep` or the end of
// `buffer`. A quote anywhere else is ordinary data.
//
// Returns the number of tokens, 0 if `buffer` is empty, or kInvalid if
// `buffer` starts with `sep` or a quoted token is never closed.
int
TokeniseWithQuotes(const const_IWSubstring& buffer,
                   char sep,
                   resizable_array<int>& tstart,
                   resizable_array<int>& tstop) {
  tstart.resize_keep_storage(0);
  tstop.resize_keep_storage(0);

  static constexpr char kDQuote = '"';

  const int nchars = buffer.length();
  if (nchars == 0) {
    return 0;
  }

  // Maybe allow for empty token at start?
  if (buffer[0] == sep) {
    return kInvalid;
  }

  // True while scanning between an opening quote and its closing quote.
  bool inside_quoted_string = false;
  // True if the token currently being scanned was opened with a quote.
  // The end of a token is only pulled back by one character (to drop the
  // closing quote) when this is set. Deciding from the preceding character
  // alone would truncate an unquoted token that happens to end with a quote.
  bool token_is_quoted = false;
  int ntokens = 1;

  if (buffer[0] == kDQuote) {
    tstart << 1;
    inside_quoted_string = true;
    token_is_quoted = true;
  } else {
    tstart << 0;
  }

  for (int i = 1; i < nchars; ++i) {
    const char c = buffer[i];
    // '\0' stands for "no next character".
    const char next_char = (i == nchars - 1) ? '\0' : buffer[i + 1];

    if (inside_quoted_string) {
      // Only a quote followed by a separator, or by the end of the buffer,
      // closes the quoted token.
      if (c == kDQuote && (next_char == sep || next_char == '\0')) {
        inside_quoted_string = false;
      }
    } else if (c == sep) {
      // A quoted token can only reach here immediately after its closing
      // quote, so `i - 1` is the position of that quote.
      tstop << (token_is_quoted ? i - 1 : i);

      // The next token starts after the separator, and after the opening
      // quote if there is one.
      token_is_quoted = (next_char == kDQuote);
      tstart << (token_is_quoted ? i + 2 : i + 1);
      ++ntokens;
    } else if (c == kDQuote && buffer[i - 1] == sep) {
      // Opening quote of the token whose start was recorded at the separator.
      inside_quoted_string = true;
    }
  }

  if (inside_quoted_string) {
    cerr << "TokeniseWithQuotes:unclosed quote '" << buffer << "'\n";
    return kInvalid;
  }

  // Close the last token. If it was quoted, the final character of the buffer
  // is its closing quote.
  tstop << (token_is_quoted ? nchars - 1 : nchars);

  if (tstart.size() != tstop.size()) {
    cerr << "TokeniseWithQuotes::Mismatch between opening and closing tokens\n";
    cerr << tstart.size() << " vs " << tstop.size() << '\n';
    return kInvalid;
  }

  if (tstart.number_elements() != ntokens) {
    cerr << "TokeniseWithQuotes:Mismatch btw tokens " << ntokens <<
            " and array size " << tstart.size() << '\n';
    return kInvalid;
  }

// #define DEBUG_TOKENISE_WITH_QUOTES
#ifdef DEBUG_TOKENISE_WITH_QUOTES
  cerr << "Found " << ntokens << " tokens\n";
  for (int i = 0; i < tstart.number_elements(); ++i) {
    cerr << ' ' << i << " start " << tstart[i] << ' ' << buffer[tstart[i]] <<
            " stop " << tstop[i] << ' ' << buffer[tstop[i]] << '\n';
  }
#endif

  return ntokens;
}
} // namespace iwstring
60 changes: 60 additions & 0 deletions src/Foundational/iwstring/tokenise_with_quotes_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

#include "googlemock/include/gmock/gmock.h"
#include "googletest/include/gtest/gtest.h"

#include "iwstring.h"

namespace {

using iwstring::TokeniseWithQuotes;

// One parameterised test case for TokeniseWithQuotes.
struct Data {
// The text to be tokenised.
IWString buffer;
// The token separator.
char sep;
// The expected return value. A negative value means failure is expected.
int ntokens;
// The expected text of each token. Not examined when `ntokens` is negative.
std::vector<const char*> expected;
};

// Fixture that holds the token boundary arrays filled in by TokeniseWithQuotes.
class TestTokenise: public testing::TestWithParam<Data> {
protected:
// Start offset of each token.
resizable_array<int> _tstart;
// Stop offset of each token.
resizable_array<int> _tstop;
};

// Tokenise the `buffer` of each test case, then check the token count and
// the text of every token against the expected values.
TEST_P(TestTokenise, TestTokenise) {
  // Bind by reference so the string and vector in the test case are not copied.
  const auto& params = GetParam();

  // ASSERT rather than EXPECT. If the count is wrong, the loop below would
  // index `_tstart` and `_tstop` with positions that may not exist.
  ASSERT_EQ(TokeniseWithQuotes(params.buffer, params.sep, _tstart, _tstop), params.ntokens) <<
      params.buffer;

  // Expected failure encountered, cannot extract matching tokens.
  if (params.ntokens < 0) {
    return;
  }

  for (int i = 0; i < params.ntokens; ++i) {
    const int b = _tstart[i];
    const int e = _tstop[i];
    // std::cerr << "b " << b << " e " << e << '\n';
    const_IWSubstring token(params.buffer.rawdata() + b, e - b);
    EXPECT_EQ(params.expected[i], token) << i << " mismatch '" << params.expected[i] <<
        "' got '" << token << "' in " << params.buffer;
  }
}
// Test cases: unquoted records, quoted records, mixtures of the two, quoted
// tokens that contain the separator, empty tokens in the middle and at the
// end, and a record with a leading separator, which is expected to fail (-1).
INSTANTIATE_TEST_SUITE_P(TestTokenise, TestTokenise, testing::Values(
Data{"a,b", ',', 2, {"a", "b"}},
Data{"aa,b", ',', 2, {"aa", "b"}},
Data{"aa,bb", ',', 2, {"aa", "bb"}},
Data{"aaa,bb", ',', 2, {"aaa", "bb"}},
Data{R"("a","b")", ',', 2, {"a", "b"}},
Data{R"("a a","b")", ',', 2, {"a a", "b"}},
Data{R"("a a","b b")", ',', 2, {"a a", "b b"}},
Data{R"(a,"b b")", ',', 2, {"a", "b b"}},
Data{R"("a a",b)", ',', 2, {"a a", "b"}},
Data{R"("a,a",b)", ',', 2, {"a,a", "b"}},
Data{R"("a,a",,b)", ',', 3, {"a,a", "", "b"}},
Data{R"("a,a",,b,)", ',', 4, {"a,a", "", "b", ""}},
Data{R"(,"a,a",,b,)", ',', -1, {"", "a,a", "", "b", ""}}
));


} // namespace
Loading

0 comments on commit 262fa14

Please sign in to comment.