linear_fingerprint: ring bits

EliLillyCo · Nov 21, 2024 · 262fa14 · 262fa14
1 parent f5f7ed9
commit 262fa14
Show file tree

Hide file tree

Showing 13 changed files with 370 additions and 86 deletions.
diff --git a/contrib/bin/.gitignore b/contrib/bin/.gitignore
@@ -0,0 +1 @@
+!lib
diff --git a/docs/Molecule_Tools/grep_molecule.md b/docs/Molecule_Tools/grep_molecule.md
@@ -125,6 +125,13 @@ Worst case would be finding a set of molecules in itself, which would require
 the full unique smiles computation. Nevertheless, doing this on 20k random
 molecules takes just 2.7 seconds.
 
+## Options
+Chirality can be removed from both the needle molecule and the molecules to
+be searched by adding the -c option.
+
+Molecules can be reduced to the largest fragment via the -l option. Again,
+the transformation is applied to both the needle and the haystack molecules.
+
 ## Further Optimisation
 The number of chiral centres could be included with the aromatic molecular formula,
 or perhaps discerned from the starting smiles. If chirality is being considered

diff --git a/docs/Molecule_Tools/iwdescr.md b/docs/Molecule_Tools/iwdescr.md
@@ -16,7 +16,6 @@ see various missing columns.
 ## Descriptors.
 The following descriptors are computed.
 
-| ---- | ---------- |
 | name | definition |
 | ---- | ---------- |
 | natoms | the number of atoms in the molecule |
@@ -323,7 +322,7 @@ off all optional descriptors. This can make a big difference in run times. Runni
 ```
 iwdescr.sh -O all file.smi > file.w
 ```
-takes 5.7 seconds to process 20k molecules, generating 264 columns of ouput. Running
+takes 5.7 seconds to process 20k molecules, generating 277 columns of output. Running
 ```
 iwdescr.sh -O none file.smi > file.w
 ```

diff --git a/src/Foundational/iwmisc/proto_for_testing.proto b/src/Foundational/iwmisc/proto_for_testing.proto
@@ -0,0 +1,24 @@
+syntax = "proto3";
+
+package for_testing;
+
+message SubMessage {  // Small message; used as the type of TestMessage.sub_message.
+  optional int32 i1 = 1;  // Optional scalar integer with explicit presence.
+  optional string str1 = 2;  // Optional string with explicit presence.
+}
+
+message TestMessage {  // Covers several field kinds: optional scalars, repeated fields and a nested message.
+  optional string str1 = 1;  // First optional string.
+  optional string str2 = 2;  // Second optional string.
+
+  optional int32 i1 = 3;  // Optional signed 32 bit integer.
+  optional uint32 ui1 = 4;  // Optional unsigned 32 bit integer.
+  optional float x = 5;  // Optional single precision float.
+
+  repeated int32 int_array = 6;  // Zero or more signed integers.
+  repeated float float_array = 7;  // Zero or more floats.
+
+  repeated string repeated_string = 8;  // Zero or more strings.
+
+  optional SubMessage sub_message = 9;  // Optional nested message.
+}
diff --git a/src/Foundational/iwstring/BUILD b/src/Foundational/iwstring/BUILD
@@ -28,6 +28,7 @@ cc_library(
         "remove_suffix.cc",
         "string_change.cc",
         "string_relationals.cc",
+        "tokenise_with_quotes.cc",
         "unhtml.cc",
     ],
     hdrs = [
@@ -88,3 +89,15 @@ cc_test(
     ],
     timeout="short",
 )
+
+cc_test(  # Unit test target for the code in tokenise_with_quotes.cc.
+    name = "tokenise_with_quotes_test",
+    srcs = [
+        "tokenise_with_quotes_test.cc",
+    ],
+    deps = [
+        ":iwstring",  # The library under test, which lists tokenise_with_quotes.cc in its srcs.
+        "@googletest//:gtest",
+        "@googletest//:gtest_main",  # Supplies main(), the test source does not define one.
+    ],
+)
diff --git a/src/Foundational/iwstring/iwstring.h b/src/Foundational/iwstring/iwstring.h
@@ -1023,6 +1023,24 @@ Equals(const const_IWSubstring& lhs, const std::string_view& rhs) {
   return 0 == ::strncmp(lhs.data(), rhs.data(), lhs.length());
 }
 
+// Splits `buffer` into tokens separated by `sep`. Intended for records from
+// tabular files where there might be double quoted tokens.
+// For each token encountered, `tstart` and `tstop` receive the start and
+// one-past-the-end offsets of that token - excluding enclosing quotes.
+// Returns the number of tokens (0 for an empty buffer), or a negative value if
+// the buffer starts with `sep` or a quote is unclosed. Typical usage:
+//  int ntokens = TokeniseWithQuotes(buffer, ',', _tstart, _tstop);
+//  if (ntokens < 0) .... fail.
+//  for (int i = 0; i < ntokens; ++i) {
+//    const int b = _tstart[i], e = _tstop[i];
+//    const_IWSubstring token(buffer.rawdata() + b, e - b);
+//  }
+int
+TokeniseWithQuotes(const const_IWSubstring& buffer,  // The record to split.
+                   char sep,  // Token separator, ',' in the example above.
+                   resizable_array<int>& tstart,  // Emptied, then filled with token start offsets.
+                   resizable_array<int>& tstop);  // Emptied, then filled with token end offsets (exclusive).
+
 }  // namespace iwstring
 
 inline std::ostream &

diff --git a/src/Foundational/iwstring/tokenise_with_quotes.cc b/src/Foundational/iwstring/tokenise_with_quotes.cc
@@ -0,0 +1,105 @@
+#include <iostream>
+
+#include "Foundational/iwstring/iwstring.h"
+
+namespace iwstring {
+
+using std::cerr;
+
+constexpr int kInvalid = -1;
+
+// Split `buffer` into tokens separated by `sep`, where a token may be
+// enclosed in double quotes. A quoted token may contain `sep`.
+//
+// A token is quoted only if its first character is a double quote. The
+// quoted token ends at the first double quote that is followed by either
+// `sep` or the end of the buffer. Doubled quotes ("") are not treated as
+// an escape.
+//
+// `tstart` and `tstop` are emptied, and then for each token receive the
+// offset of the first character, and the offset one past the last character
+// of the token. Enclosing quotes are excluded, so a token can be formed as
+//   const_IWSubstring token(buffer.rawdata() + tstart[i], tstop[i] - tstart[i]);
+// Empty tokens are reported with tstart[i] == tstop[i].
+//
+// Returns the number of tokens, which is zero if `buffer` is empty.
+// Returns kInvalid if `buffer` begins with `sep`, or if a quoted token is
+// never closed.
+int
+TokeniseWithQuotes(const const_IWSubstring& buffer,
+                   char sep,
+                   resizable_array<int>& tstart,
+                   resizable_array<int>& tstop) {
+  tstart.resize_keep_storage(0);
+  tstop.resize_keep_storage(0);
+
+  static constexpr char kDQuote = '"';
+
+  const int nchars = buffer.length();
+  if (nchars == 0) {
+    return 0;
+  }
+
+  // Maybe allow for empty token at start?
+  if (buffer[0] == sep) {
+    return kInvalid;
+  }
+
+  // Whether the token currently being scanned began with a quote. Only such
+  // tokens have a closing quote to strip. An unquoted token that happens to
+  // end with a quote character, 5" for example, must be left intact.
+  bool token_is_quoted = (buffer[0] == kDQuote);
+  // True from the opening quote until the matching closing quote is seen.
+  bool inside_quoted_string = token_is_quoted;
+  int ntokens = 1;
+
+  tstart << (token_is_quoted ? 1 : 0);
+
+  for (int i = 1; i < nchars; ++i) {
+    const char c = buffer[i];
+    // Tested explicitly, rather than via a '\0' sentinel, so a NUL byte
+    // within the buffer is not mistaken for the end of the buffer.
+    const bool is_last = (i == nchars - 1);
+
+    if (inside_quoted_string) {
+      // Separators are just data here. Only a quote that is followed by a
+      // separator, or the end of the buffer, closes the token.
+      if (c == kDQuote && (is_last || buffer[i + 1] == sep)) {
+        inside_quoted_string = false;
+      }
+      continue;
+    }
+
+    if (c != sep) {
+      continue;
+    }
+
+    // End of the current token. If it was quoted, the previous character is
+    // the closing quote, which is not part of the token.
+    tstop << (token_is_quoted ? i - 1 : i);
+
+    // Start of the next token, which may be empty.
+    token_is_quoted = (!is_last && buffer[i + 1] == kDQuote);
+    if (token_is_quoted) {
+      tstart << (i + 2);
+      inside_quoted_string = true;
+      // Step over the opening quote so that it cannot also be interpreted
+      // as a closing quote.
+      ++i;
+    } else {
+      tstart << (i + 1);
+    }
+    ++ntokens;
+  }
+
+  if (inside_quoted_string) {
+    cerr << "TokeniseWithQuotes:unclosed quote '" << buffer << "'\n";
+    return kInvalid;
+  }
+
+  // Close the last token. A quoted token that has been closed here must
+  // have its closing quote as the last character of the buffer.
+  tstop << (token_is_quoted ? nchars - 1 : nchars);
+
+  // Defensive consistency checks, not expected to fire.
+  if (tstart.size() != tstop.size()) {
+    cerr << "TokeniseWithQuotes::Mismatch between opening and closing tokens\n";
+    cerr << tstart.size() << " vs " << tstop.size() << '\n';
+    return kInvalid;
+  }
+
+  if (tstart.number_elements() != ntokens) {
+    cerr << "TokeniseWithQuotes:Mismatch btw tokens " << ntokens <<
+            " and array size " << tstart.size() << '\n';
+    return kInvalid;
+  }
+
+// #define DEBUG_TOKENISE_WITH_QUOTES
+#ifdef DEBUG_TOKENISE_WITH_QUOTES
+  cerr << "Found " << ntokens << " tokens\n";
+  for (int i = 0; i < tstart.number_elements(); ++i) {
+    cerr << ' ' << i << " start " << tstart[i] << ' ' << buffer[tstart[i]] <<
+            " stop " << tstop[i] << ' ' << buffer[tstop[i]] << '\n';
+  }
+#endif
+
+  return ntokens;
+}
+}  // namespace iwstring
diff --git a/src/Foundational/iwstring/tokenise_with_quotes_test.cc b/src/Foundational/iwstring/tokenise_with_quotes_test.cc
@@ -0,0 +1,60 @@
+
+#include "googlemock/include/gmock/gmock.h"
+#include "googletest/include/gtest/gtest.h"
+
+#include "iwstring.h"
+
+namespace {
+
+using iwstring::TokeniseWithQuotes;
+
+struct Data {  // One parameterised test case for TokeniseWithQuotes.
+  IWString buffer;  // The text to be tokenised.
+  char sep;  // The token separator passed to TokeniseWithQuotes.
+  int ntokens;  // Expected return value. Negative if failure is expected.
+  std::vector<const char*> expected;  // Expected tokens, in order. Not examined when ntokens < 0.
+};
+
+class TestTokenise: public testing::TestWithParam<Data> {  // Fixture, parameterised on Data.
+  protected:
+    resizable_array<int> _tstart;  // Receives token start offsets from TokeniseWithQuotes.
+    resizable_array<int> _tstop;  // Receives token end offsets from TokeniseWithQuotes.
+};
+
+// Tokenises the buffer of each Data instance and checks both the number of
+// tokens returned and the text of every token.
+TEST_P(TestTokenise, TestTokenise) {
+  const auto& params = GetParam();
+
+  const int ntokens = TokeniseWithQuotes(params.buffer, params.sep, _tstart, _tstop);
+
+  // ASSERT rather than EXPECT. If the count is wrong, the loop below would
+  // index beyond the end of _tstart and _tstop.
+  ASSERT_EQ(ntokens, params.ntokens) << params.buffer;
+
+  // Expected failure encountered, cannot extract matching tokens.
+  if (params.ntokens < 0) {
+    return;
+  }
+
+  // Guard against a test case that lists fewer expected tokens than it claims.
+  ASSERT_GE(params.expected.size(), static_cast<size_t>(params.ntokens)) <<
+                params.buffer;
+
+  for (int i = 0; i < params.ntokens; ++i) {
+    const int b = _tstart[i];
+    const int e = _tstop[i];
+    // tstop is one past the last character of the token, so e - b is the length.
+    const const_IWSubstring token(params.buffer.rawdata() + b, e - b);
+    EXPECT_EQ(params.expected[i], token) << i << " mismatch '" << params.expected[i] <<
+                        "' got '" << token << "' in " << params.buffer;
+  }
+}
+INSTANTIATE_TEST_SUITE_P(TestTokenise, TestTokenise, testing::Values(  // Each Data is {buffer, sep, ntokens, expected}.
+  Data{"a,b", ',', 2, {"a", "b"}},  // Unquoted tokens of varying length.
+  Data{"aa,b", ',', 2, {"aa", "b"}},
+  Data{"aa,bb", ',', 2, {"aa", "bb"}},
+  Data{"aaa,bb", ',', 2, {"aaa", "bb"}},
+  Data{R"("a","b")", ',', 2, {"a", "b"}},  // Enclosing quotes are not part of the token.
+  Data{R"("a a","b")", ',', 2, {"a a", "b"}},
+  Data{R"("a a","b b")", ',', 2, {"a a", "b b"}},
+  Data{R"(a,"b b")", ',', 2, {"a", "b b"}},  // Mixed quoted and unquoted tokens.
+  Data{R"("a a",b)", ',', 2, {"a a", "b"}},
+  Data{R"("a,a",b)", ',', 2, {"a,a", "b"}},  // Separator inside quotes is data.
+  Data{R"("a,a",,b)", ',', 3, {"a,a", "", "b"}},  // Empty token in the middle.
+  Data{R"("a,a",,b,)", ',', 4, {"a,a", "", "b", ""}},  // Trailing separator gives a final empty token.
+  Data{R"(,"a,a",,b,)", ',', -1, {"", "a,a", "", "b", ""}}  // Leading separator is rejected; expected is not examined.
+));
+
+
+}  // namespace