From 262fa1420cfd93bf2af0a210d560e7282c60ac81 Mon Sep 17 00:00:00 2001
From: Ian Watson <ianiwatson@gmail.com>
Date: Thu, 21 Nov 2024 18:30:52 -0500
Subject: [PATCH] linear_fingerprint: ring bits

---
 contrib/bin/.gitignore                        |   1 +
 docs/Molecule_Tools/grep_molecule.md          |   7 +
 docs/Molecule_Tools/iwdescr.md                |   3 +-
 .../iwmisc/proto_for_testing.proto            |  24 ++++
 src/Foundational/iwstring/BUILD               |  13 ++
 src/Foundational/iwstring/iwstring.h          |  18 +++
 .../iwstring/tokenise_with_quotes.cc          | 105 ++++++++++++++
 .../iwstring/tokenise_with_quotes_test.cc     |  60 ++++++++
 src/Molecule_Lib/linear_fingerprint.cc        | 128 ++++++++++++------
 src/Molecule_Lib/linear_fingerprint.h         |   2 +
 src/Molecule_Tools/grep_molecule.cc           |   6 +-
 src/Molecule_Tools/linear_fingerprint_main.cc |  50 ++++---
 src/Utilities/General/iwcut.cc                |  39 +++---
 13 files changed, 370 insertions(+), 86 deletions(-)
 create mode 100644 contrib/bin/.gitignore
 create mode 100644 src/Foundational/iwmisc/proto_for_testing.proto
 create mode 100644 src/Foundational/iwstring/tokenise_with_quotes.cc
 create mode 100644 src/Foundational/iwstring/tokenise_with_quotes_test.cc

diff --git a/contrib/bin/.gitignore b/contrib/bin/.gitignore
new file mode 100644
index 00000000..ebf4281d
--- /dev/null
+++ b/contrib/bin/.gitignore
@@ -0,0 +1 @@
+!lib
diff --git a/docs/Molecule_Tools/grep_molecule.md b/docs/Molecule_Tools/grep_molecule.md
index bf50fa36..d4337988 100644
--- a/docs/Molecule_Tools/grep_molecule.md
+++ b/docs/Molecule_Tools/grep_molecule.md
@@ -125,6 +125,13 @@ Worst case would be finding a set of molecules in itself, which would require
 the full unique smiles computation. Nevertheless, doing this on 20k random
 molecules takes just 2.7 seconds.
 
+## Options
+Chirality can be removed from both the needle molecule and the molecules to
+be searched by adding the -c option.
+
+Molecules can be reduced to the largest fragment via the -l option. Again,
+the transformation is applied to both the needle and the haystack molecules.
+
 ## Further Optimisation
 The number of chiral centres could be included with the aromatic molecular formula,
 or perhaps discerned from the starting smiles. If chirality is being considered
diff --git a/docs/Molecule_Tools/iwdescr.md b/docs/Molecule_Tools/iwdescr.md
index 4d91d0a3..2015f335 100644
--- a/docs/Molecule_Tools/iwdescr.md
+++ b/docs/Molecule_Tools/iwdescr.md
@@ -16,7 +16,6 @@ see various missing columns.
 ## Descriptors.
 The following descriptors are computed.
 
-| ---- | ---------- |
 | name | definition |
 | ---- | ---------- |
 | natoms | the number of atoms in the molecule |
@@ -323,7 +322,7 @@ off all optional descriptors. This can make a big difference in run times. Runni
 ```
 iwdescr.sh -O all file.smi > file.w
 ```
-takes 5.7 seconds to process 20k molecules, generating 264 columns of ouput. Running
+takes 5.7 seconds to process 20k molecules, generating 277 columns of output. Running
 ```
 iwdescr.sh -O none file.smi > file.w
 ```
diff --git a/src/Foundational/iwmisc/proto_for_testing.proto b/src/Foundational/iwmisc/proto_for_testing.proto
new file mode 100644
index 00000000..ff04c0b5
--- /dev/null
+++ b/src/Foundational/iwmisc/proto_for_testing.proto
@@ -0,0 +1,24 @@
+syntax = "proto3";
+
+package for_testing;
+
+message SubMessage {
+  optional int32 i1 = 1;
+  optional string str1 = 2;
+}
+
+message TestMessage {
+  optional string str1 = 1;
+  optional string str2 = 2;
+
+  optional int32 i1 = 3;
+  optional uint32 ui1 = 4;
+  optional float x = 5;
+
+  repeated int32 int_array = 6;
+  repeated float float_array = 7;
+
+  repeated string repeated_string = 8;
+
+  optional SubMessage sub_message = 9;
+}
diff --git a/src/Foundational/iwstring/BUILD b/src/Foundational/iwstring/BUILD
index 055acf7b..1ba5d85e 100644
--- a/src/Foundational/iwstring/BUILD
+++ b/src/Foundational/iwstring/BUILD
@@ -28,6 +28,7 @@ cc_library(
         "remove_suffix.cc",
         "string_change.cc",
         "string_relationals.cc",
+        "tokenise_with_quotes.cc",
         "unhtml.cc",
     ],
     hdrs = [
@@ -88,3 +89,15 @@ cc_test(
     ],
     timeout="short",
 )
+
+cc_test(
+    name = "tokenise_with_quotes_test",
+    srcs = [
+        "tokenise_with_quotes_test.cc",
+    ],
+    deps = [
+        ":iwstring",
+        "@googletest//:gtest",
+        "@googletest//:gtest_main",
+    ],
+)
diff --git a/src/Foundational/iwstring/iwstring.h b/src/Foundational/iwstring/iwstring.h
index 1349dc1e..282a8c31 100644
--- a/src/Foundational/iwstring/iwstring.h
+++ b/src/Foundational/iwstring/iwstring.h
@@ -1023,6 +1023,24 @@ Equals(const const_IWSubstring& lhs, const std::string_view& rhs) {
   return 0 == ::strncmp(lhs.data(), rhs.data(), lhs.length());
 }
 
+// Used for reading records from tabular files where there might
+// be quoted tokens. Both arrays are emptied first; then for each token
+// its start index is appended to `tstart` and its stop index (one past
+// the last character) to `tstop`, excluding any enclosing quotes.
+// Returns the number of tokens, or a negative value on failure (the
+// buffer begins with `sep`, or a quote is not closed). Example:
+//  int ntokens = TokeniseWithQuotes(buffer, ',', _tstart, _tstop);
+//  if (ntokens < 0) .... fail.
+//  for (int i = 0; i < ntokens; ++i) {
+//    int b = _tstart[i];
+//    int e = _tstop[i];
+//    const_IWSubstring token(buffer.rawdata() + b, e - b); }
+int
+TokeniseWithQuotes(const const_IWSubstring& buffer,
+                   char sep,
+                   resizable_array<int>& tstart,
+                   resizable_array<int>& tstop);
+
 }  // namespace iwstring
 
 inline std::ostream &
diff --git a/src/Foundational/iwstring/tokenise_with_quotes.cc b/src/Foundational/iwstring/tokenise_with_quotes.cc
new file mode 100644
index 00000000..3d749350
--- /dev/null
+++ b/src/Foundational/iwstring/tokenise_with_quotes.cc
@@ -0,0 +1,105 @@
+#include <iostream>
+
+#include "Foundational/iwstring/iwstring.h"
+
+namespace iwstring {
+
+using std::cerr;
+
+constexpr int kInvalid = -1;
+
+// Splits `buffer` into tokens separated by `sep`, where a token may be
+// enclosed in double quotes and may then contain `sep`.
+// On return tstart[i] is the index of the first character of token i and
+// tstop[i] is one past its last character, enclosing quotes excluded.
+// Only a token that was opened by a quote has a closing quote removed.
+// Returns the number of tokens, 0 for an empty buffer, or kInvalid if the
+// buffer starts with `sep` or a quote is never closed.
+int
+TokeniseWithQuotes(const const_IWSubstring& buffer,
+                   char sep,
+                   resizable_array<int>& tstart,
+                   resizable_array<int>& tstop) {
+  tstart.resize_keep_storage(0);
+  tstop.resize_keep_storage(0);
+
+  static constexpr char kDQuote = '"';
+
+  const int nchars = buffer.length();
+  if (nchars == 0) {
+    return 0;
+  }
+
+  // Maybe allow for empty token at start?
+  if (buffer[0] == sep) {
+    return kInvalid;
+  }
+
+  // True while scanning between an opening quote and its closing quote.
+  bool inside_quoted_string = false;
+  // True if the token currently being scanned was opened by a quote.
+  // A quote is stripped from the end of a token only in that case, so an
+  // unquoted token such as `a"` keeps its final character.
+  bool token_is_quoted = false;
+  int ntokens = 1;
+
+  if (buffer[0] == kDQuote) {
+    token_is_quoted = true;
+    inside_quoted_string = true;
+  }
+  tstart << (token_is_quoted ? 1 : 0);
+
+  for (int i = 1; i < nchars; ++i) {
+    const char c = buffer[i];
+    // '\0' stands in for "end of buffer".
+    const char next_char = (i == nchars - 1) ? '\0' : buffer[i + 1];
+
+    if (inside_quoted_string) {
+      // A quote closes the token only if a separator or the end follows.
+      if (c == kDQuote && (next_char == sep || next_char == '\0')) {
+        inside_quoted_string = false;
+      }
+    } else if (c == sep) {
+      // Finish the current token, leaving out its closing quote if any.
+      tstop << (token_is_quoted ? i - 1 : i);
+
+      // Start the next token, skipping over an opening quote if present.
+      token_is_quoted = (next_char == kDQuote);
+      tstart << (token_is_quoted ? i + 2 : i + 1);
+      ++ntokens;
+    } else if (c == kDQuote && buffer[i - 1] == sep) {
+      inside_quoted_string = true;
+    }
+  }
+
+  if (inside_quoted_string) {
+    cerr << "TokeniseWithQuotes:unclosed quote '" << buffer << "'\n";
+    return kInvalid;
+  }
+
+  tstop << (token_is_quoted ? nchars - 1 : nchars);
+
+  if (tstart.size() != tstop.size()) {
+    cerr << "TokeniseWithQuotes::Mismatch between opening and closing tokens\n";
+    cerr << tstart.size() << " vs " << tstop.size() << '\n';
+    return kInvalid;
+  }
+
+  if (tstart.number_elements() != ntokens) {
+    cerr << "TokeniseWithQuotes:Mismatch btw tokens " << ntokens <<
+            " and array size " << tstart.size() << '\n';
+    return kInvalid;
+  }
+
+// #define DEBUG_TOKENISE_WITH_QUOTES
+#ifdef DEBUG_TOKENISE_WITH_QUOTES
+  // Indices only: tstart[i] and tstop[i] may equal buffer.length().
+  cerr << "Found " << ntokens << " tokens\n";
+  for (int i = 0; i < tstart.number_elements(); ++i) {
+    cerr << ' ' << i << " start " << tstart[i] << " stop " << tstop[i] << '\n';
+  }
+#endif
+
+  return ntokens;
+}
+}  // namespace iwstring
diff --git a/src/Foundational/iwstring/tokenise_with_quotes_test.cc b/src/Foundational/iwstring/tokenise_with_quotes_test.cc
new file mode 100644
index 00000000..e190e865
--- /dev/null
+++ b/src/Foundational/iwstring/tokenise_with_quotes_test.cc
@@ -0,0 +1,60 @@
+
+#include "googlemock/include/gmock/gmock.h"
+#include "googletest/include/gtest/gtest.h"
+
+#include "iwstring.h"
+
+namespace {
+
+using iwstring::TokeniseWithQuotes;
+
+struct Data {
+  IWString buffer;
+  char sep;
+  int ntokens;
+  std::vector<const char*> expected;
+};
+
+class TestTokenise: public testing::TestWithParam<Data> {
+  protected:
+    resizable_array<int> _tstart;
+    resizable_array<int> _tstop;
+};
+
+// Tokenise the parameterised buffer; check the token count, then each token.
+TEST_P(TestTokenise, TestTokenise) {
+  const Data& params = GetParam();
+  const int ntokens = TokeniseWithQuotes(params.buffer, params.sep, _tstart, _tstop);
+  // ASSERT rather than EXPECT: with a wrong count the loop below would read
+  // past the end of _tstart and _tstop.
+  ASSERT_EQ(ntokens, params.ntokens) << params.buffer;
+  // Expected failure encountered, cannot extract matching tokens.
+  if (params.ntokens < 0) {
+    return;
+  }
+
+  for (int i = 0; i < ntokens; ++i) {
+    const int b = _tstart[i];
+    const_IWSubstring token(params.buffer.rawdata() + b, _tstop[i] - b);
+    EXPECT_EQ(params.expected[i], token) << i << " mismatch '" << params.expected[i] <<
+                        "' got '" << token << "' in " << params.buffer;
+  }
+}
+INSTANTIATE_TEST_SUITE_P(TestTokenise, TestTokenise, testing::Values(
+  Data{"a,b", ',', 2, {"a", "b"}},
+  Data{"aa,b", ',', 2, {"aa", "b"}},
+  Data{"aa,bb", ',', 2, {"aa", "bb"}},
+  Data{"aaa,bb", ',', 2, {"aaa", "bb"}},
+  Data{R"("a","b")", ',', 2, {"a", "b"}},
+  Data{R"("a a","b")", ',', 2, {"a a", "b"}},
+  Data{R"("a a","b b")", ',', 2, {"a a", "b b"}},
+  Data{R"(a,"b b")", ',', 2, {"a", "b b"}},
+  Data{R"("a a",b)", ',', 2, {"a a", "b"}},
+  Data{R"("a,a",b)", ',', 2, {"a,a", "b"}},
+  Data{R"("a,a",,b)", ',', 3, {"a,a", "", "b"}},
+  Data{R"("a,a",,b,)", ',', 4, {"a,a", "", "b", ""}},
+  Data{R"(,"a,a",,b,)", ',', -1, {"", "a,a", "", "b", ""}}
+));
+
+
+}  // namespace
diff --git a/src/Molecule_Lib/linear_fingerprint.cc b/src/Molecule_Lib/linear_fingerprint.cc
index d055b73f..49064af1 100644
--- a/src/Molecule_Lib/linear_fingerprint.cc
+++ b/src/Molecule_Lib/linear_fingerprint.cc
@@ -12,7 +12,7 @@ using std::endl;
 
 namespace internal {
 
-constexpr int exclude_atom = -1;
+constexpr int kExcludeAtom = -1;
 
 Options::Options () 
 {
@@ -89,7 +89,7 @@ LinearFpStatus::LinearFpStatus(const Options& opt, const Molecule& m,
       if (include_atom[i])
         _atom_in_path[i] = 0;
       else
-        _atom_in_path[i] = exclude_atom;
+        _atom_in_path[i] = kExcludeAtom;
     }
 
     for (int i = 0; i < _nedges; ++i) {
@@ -133,6 +133,26 @@ LinearFpStatus::~LinearFpStatus()
   return;
 }
 
+// Debug aid: writes the current path length, the entries of _path_index
+// and, for every atom, its _atom_in_path value to `output`.
+int
+LinearFpStatus::DebugPrint(std::ostream& output) const {
+  output << "LinearFpStatus:path length " << _path_length << '\n';
+  if (_path_length != 0) {
+    output << "0 atom " << _path_index[0] << '\n';
+    // Entries after the first are printed as (bond, atom) pairs.
+    for (int ndx = 1; ndx < _path_length; ndx += 2) {
+      output << ndx << " bond " << _path_index[ndx];
+      output << " atom " << _path_index[ndx + 1] << '\n';
+    }
+    for (int atom = 0; atom < _matoms; ++atom) {
+      output << " atom " << atom << " in path " << _atom_in_path[atom] << '\n';
+    }
+  }
+
+  return output.good();
+}
+
 uint64_t
 LinearFpStatus::_BondHash(const Bond& b) const {
   if (b.is_aromatic())
@@ -160,6 +180,7 @@ LinearFpStatus::_AddBondToPath(const Bond & b, const atom_number_t next_atom) {
 
 #ifdef DEBUG_LINEAR_FP
   cerr << "At length " << _path_length << " adding bond number " << bond_number << " value " << _bond_constant[bond_number] << endl;
+  cerr << "_AddBondToPath adding atom " << next_atom << " length " << _path_length << '\n';
 #endif
 
   _path[_path_length] = _bond_constant[bond_number];
@@ -186,6 +207,9 @@ LinearFpStatus::_PopPath() {
   const int atom_number = _path_index[_path_length];
   assert(_atom_in_path[atom_number]);
   _atom_in_path[atom_number]--;
+#ifdef DEBUG_EXPAND
+  cerr << "_PopPath removing atom " << atom_number << " at length " << _path_length << '\n';
+#endif
 
   _path_length--;
 
@@ -223,13 +247,14 @@ LinearFpStatus::Fingerprint() {
     return 1;
   }
 
-  if (nullptr != _stream_for_bit_meanings)
+  if (nullptr != _stream_for_bit_meanings) {
     _WriteLabelledSmiles();
+  }
 
-  for (int i = 0; i < _matoms; ++i)
-  {
-    if (exclude_atom == _atom_in_path[i])
+  for (int i = 0; i < _matoms; ++i) {
+    if (kExcludeAtom == _atom_in_path[i]) {
       continue;
+    }
 
     _StartPath(i);
     _MaybeFormBit();
@@ -249,37 +274,56 @@ LinearFpStatus::Fingerprint() {
 void
 LinearFpStatus::_Expand() 
 {
-  if (_path_length / 2 >= _options._max_length)
+  if (_path_length / 2 >= _options._max_length) {
     return;
+  }
 
-  const atom_number_t a1 = _path_index[_path_length - 1];
+  // Extract to single variable to avoid complex comparisons in the loop.
+  bool rings_or_crossing_paths;
+  if (_path_length / 2 < 3) {
+    rings_or_crossing_paths = false;
+  } else {
+    rings_or_crossing_paths = (_options._fingerprint_ring_presence ||
+                               _options._paths_can_cross);
+  }
 
-  const Atom * a = _atom[a1];
+  const atom_number_t a1 = _path_index[_path_length - 1];
+#ifdef DEBUG_EXPAND
+  cerr << "Expand from atom " << a1 << '\n';
+  DebugPrint(cerr);
+#endif
 
-  const int acon = a->ncon();
+  const Atom* a = _atom[a1];
 
-  for (int i = 0; i < acon; ++i) {
-    const Bond * b = a->item(i);
-    if (_bond_in_path[b->bond_number()])
+  for (const Bond* b : *a) {
+    if (_bond_in_path[b->bond_number()]) {  // catches return to previous atom.
       continue;
+    }
 
     const atom_number_t a2 = b->other(a1);
-    if (exclude_atom == _atom_in_path[a2])
+    if (kExcludeAtom == _atom_in_path[a2]) {
       continue;
+    }
 
-    bool a2_already_in_path;
-    if (!_atom_in_path[a2])  // The easy case
-      a2_already_in_path = false;
-    else if (_options._fingerprint_ring_presence ||
-             _options._paths_can_cross)
-      a2_already_in_path = true;
-    else  // Avoid placed atom.
+    bool a2_already_in_path = false;
+    if (! _atom_in_path[a2]) {
+      // New atom, great.
+    } else if (! rings_or_crossing_paths) {
+      // In path, but not doing anything with rings or crossing paths.
       continue;
+    } else {  // Already in path, maybe ring and/or crossing path.
+      if (_options._fingerprint_ring_presence) {
+        _AddBondToPath(*b, a2);
+        _FormRingBit();
+        _PopPath();
+      }
+      // TODO:ianwatson implement crossing path idea.
+      continue;
+    }
 
     _AddBondToPath(*b, a2);
     _MaybeFormBit();
-    if (a2_already_in_path) 
-    {
+    if (a2_already_in_path) {
       if (_options._fingerprint_ring_presence)
         _FormRingBit();
       if (!_options._paths_can_cross)
@@ -346,34 +390,32 @@ LinearFpStatus::_MaybeFormBit() {
 
 // Atom at end of path occurs somewhere previously. Find it.
 void
-LinearFpStatus::_FormRingBit()
-{
+LinearFpStatus::_FormRingBit() {
   const int target = _path_index[_path_length - 1];
 
-  int first_index = -1;
-  for (int i = 0; i < (_path_length - 1); i += 2)
-  {
-    if (_path_index[i] == target)
-    {
-      first_index = i;
+  int last_index = -1;
+  for (int i = _path_length -3; i >= 0; i -= 2) {
+    if (_path_index[i] == target) {
+      last_index = i;
       break;
     }
   }
 
-  if (first_index < 0) 
-  {
-    cerr << "LinearFpStatus:_FormRingBit:first occurrence not found\n";
+  if (last_index < 0) {
+    cerr << "LinearFpStatus:_FormRingBit:first occurrence not found, target " << target << '\n';
     _PrintPath(cerr);
     return;
   }
 
-  uint64_t t1 = _path[first_index];
+  uint64_t t1 = _path[last_index];
   uint64_t t2 = _path[_path_length - 1];
 
-  if (t1 < t2)
+  if (t1 < t2) {
     std::swap(t1, t2);
+  }
 
-  _sfc.hit_bit(_magic1 * t1 + (_path_length - first_index) * (t2 + _magic2));
+  // Open question, should we include the bond type in this calculation?
+  _sfc.hit_bit(_magic1 * t1 + (_path_length - last_index) * (t2 + _magic2));
 }
 
 void
@@ -410,20 +452,22 @@ LinearFpStatus::_FormFingerprintBackward()
 }
 
 void
-LinearFpStatus::_ExamineBit(const uint64_t b)
-{
+LinearFpStatus::_ExamineBit(const uint64_t b) {
 #ifdef DEBUG_LINEAR_FP
   cerr << "Formed bit " << b <<endl;
 #endif
 
-  if (! _need_to_examine_bits_formed)
+  if (! _need_to_examine_bits_formed) {
     return;
+  }
 
-  if (_options._check_coverage)
+  if (_options._check_coverage) {
     _DoCheckCoverage();
+  }
 
-  if (nullptr != _stream_for_bit_meanings)
+  if (nullptr != _stream_for_bit_meanings) {
     _WriteBit(b);
+  }
 
   return;
 }
diff --git a/src/Molecule_Lib/linear_fingerprint.h b/src/Molecule_Lib/linear_fingerprint.h
index e7887d3d..c758c543 100644
--- a/src/Molecule_Lib/linear_fingerprint.h
+++ b/src/Molecule_Lib/linear_fingerprint.h
@@ -142,6 +142,8 @@ class LinearFpStatus {
     // Called once for each molecule so we have ready access to atom numbers
     // during debugging.
     void _WriteLabelledSmiles() const;
+
+    int DebugPrint(std::ostream& output) const;
     
   public:
     LinearFpStatus(const Options& options, const Molecule& m,
diff --git a/src/Molecule_Tools/grep_molecule.cc b/src/Molecule_Tools/grep_molecule.cc
index 5eb0f70b..96e10653 100644
--- a/src/Molecule_Tools/grep_molecule.cc
+++ b/src/Molecule_Tools/grep_molecule.cc
@@ -36,8 +36,10 @@ are files to be searched.
 Multiple patterns (smiles) can be specified on the command line separated by commas
   'C methane,CC ethane,CCC propane'
 will search for methane, ethane and propane in subsequent files. Quotes are essential.
- -f <fname>             like the -f option to grep, read the patterns (smiles) from <fname>
- -v                     verbose output
+ -f <fname>             like the -f option to grep, read the patterns (smiles) from <fname>.
+ -c                     remove chirality before comparing.
+ -l                     reduce to largest fragment.
+ -v                     verbose output.
   )";
 
   ::exit(rc);
diff --git a/src/Molecule_Tools/linear_fingerprint_main.cc b/src/Molecule_Tools/linear_fingerprint_main.cc
index d4015a5d..843d45a4 100644
--- a/src/Molecule_Tools/linear_fingerprint_main.cc
+++ b/src/Molecule_Tools/linear_fingerprint_main.cc
@@ -67,32 +67,33 @@ usage(int rc)
 #endif
   // clang-format on
   // clang-format off
-  cerr << "Computes linear path fingerprints\n";
-  cerr << "  -r <rad>      minimum path length (def 0)\n";
-  cerr << "  -R <rad>      maximum path length (def 7)\n";
-  cerr << "  -P ...        atom type specification\n";
-  cerr << "  -J <tag>      tag for fingerprints\n";
-  cerr << "  -f            function as a TDT filter\n";
-  cerr << "  -X <fname>    look for bits in <fname> and provide explanations\n";
-  cerr << "  -B <fname>    write all bits found to <fname>\n";
-  cerr << "  -y            check for bit collisions\n";
-  cerr << "  -s            gather statistics on molecules processed\n";
-  cerr << "  -c            produce isotopically labelled smiles with coverage\n";
-  cerr << "  -x            allow linear paths can cross\n";
-  cerr << "  -l            reduce to largest fragment\n";
-  cerr << "  -i <type>     input specification\n";
-  cerr << "  -g ...        chemical standardisation options\n";
-  cerr << "  -E ...        standard element specifications\n";
-  cerr << "  -A ...        standard aromaticity specifications\n";
-  cerr << "  -v            verbose output\n";
+  cerr << R"(Computes linear path fingerprints
+ -r <rad>      minimum path length (def 0)
+ -R <rad>      maximum path length (def 7)
+ -P ...        atom type specification
+ -J <tag>      tag for fingerprints
+ -f            function as a TDT filter
+ -X <fname>    look for bits in <fname> and provide explanations
+ -B <fname>    write all bits found to <fname>
+ -y            check for bit collisions
+ -s            gather statistics on molecules processed
+ -c            produce isotopically labelled smiles with coverage
+ -x            allow linear paths can cross
+ -w            set ring bits when a path forms a ring
+ -l            reduce to largest fragment
+ -i <type>     input specification
+ -g ...        chemical standardisation options
+ -E ...        standard element specifications
+ -A ...        standard aromaticity specifications
+ -v            verbose output
+)";
   // clang-format on
 
   exit(rc);
 }
 
 void
-Preprocess(Molecule & m)
-{
+Preprocess(Molecule & m) {
   if (reduce_to_largest_fragment)
     m.reduce_to_largest_fragment();
 
@@ -268,7 +269,7 @@ LinearFingerprint(const char * fname, FileType input_type,
 int
 LinearFingerprint(int argc, char ** argv)
 {
-  Command_Line cl(argc, argv, "E:A:K:lg:i:J:P:vfr:R:ysB:cx");
+  Command_Line cl(argc, argv, "E:A:K:lg:i:J:P:vfr:R:ysB:cxw");
 
   if (cl.unrecognised_options_encountered())
     usage(1);
@@ -397,6 +398,13 @@ LinearFingerprint(int argc, char ** argv)
       cerr << "Paths can cross\n";
   }
 
+  if (cl.option_present('w')) {
+    linear_fp_gen.set_fingerprint_ring_presence(true);
+    if (verbose) {
+      cerr << "Will set bits for presence of rings\n";
+    }
+  }
+
   if (cl.option_present('B')) {
     const char * fname = cl.option_value('B');
     if (!linear_fp_gen.OpenStreamForBitMeanings(fname)) {
diff --git a/src/Utilities/General/iwcut.cc b/src/Utilities/General/iwcut.cc
index d8e3b713..dc17cd6d 100644
--- a/src/Utilities/General/iwcut.cc
+++ b/src/Utilities/General/iwcut.cc
@@ -6,19 +6,19 @@
                      + __GNUC_MINOR__ * 100 \
                                           + __GNUC_PATCHLEVEL__)
 
+#include <algorithm>
+#include <cctype>
 #include <iostream>
 #include <memory>
-#include <algorithm>
 #include <random>
-#include <cctype>
-using std::cerr;
-using std::endl;
 
 #include "Foundational/cmdline/cmdline.h"
 #include "Foundational/data_source/iwstring_data_source.h"
 #include "Foundational/iwmisc/misc.h"
 #include "Foundational/iwmisc/iwre2.h"
 
+using std::cerr;
+
 static void
 usage(int rc)
 {
@@ -710,7 +710,7 @@ iwcut(const const_IWSubstring & buffer,
   {
     cerr << ' ' << word_beginnings[i];
   }
-  cerr << endl;
+  cerr << '\n';
 #endif
 
   for (int i = 0; i < nr; i++)
@@ -797,7 +797,7 @@ locate_quoted_tokens_word_beginnings(const const_IWSubstring & buffer,
 
   for (int i = 1; i < n; ++i)
   {
-//  cerr << " char " << i << " '" << buffer[i] << "' quote " << in_quote << endl;
+//  cerr << " char " << i << " '" << buffer[i] << "' quote " << in_quote << '\n';
     if (dquote == buffer[i])
       in_quote = ! in_quote;
     else if (in_quote)
@@ -809,7 +809,7 @@ locate_quoted_tokens_word_beginnings(const const_IWSubstring & buffer,
 #ifdef DEBUG_QUOTED_WB
   for (int i = 0; i < word_beginnings.size(); ++i)
   {
-    cerr << " wb " << i << ' ' << word_beginnings[i] << endl;
+    cerr << " wb " << i << ' ' << word_beginnings[i] << '\n';
   }
 #endif
 
@@ -828,7 +828,7 @@ iwcut(const const_IWSubstring & buffer,
   if (columns_in_input > 0)
     word_beginnings.resize(columns_in_input);
 
-//cerr << "Line " << __LINE__ << " iqt " << input_is_quoted_tokens << endl;
+//cerr << "Line " << __LINE__ << " iqt " << input_is_quoted_tokens << '\n';
 
   int ncol;
   if (input_is_quoted_tokens)
@@ -840,7 +840,7 @@ iwcut(const const_IWSubstring & buffer,
 
 #ifdef DEBUG_IWCUT
   cerr << "Processing '" << buffer << "'\n";
-  cerr << "ncol " << ncol << " count " << buffer.ccount(input_token_separator) << endl;
+  cerr << "ncol " << ncol << " count " << buffer.ccount(input_token_separator) << '\n';
 #endif
 
   if (ncol > columns_in_input)
@@ -944,7 +944,7 @@ find_column_number(const IWString & descriptor,
     columns_requested.add_if_not_already_present(i);
 
     if (verbose > 1)
-      cerr << "Descriptor '" << d << " in column " << (i + 1) << endl;
+      cerr << "Descriptor '" << d << " in column " << (i + 1) << '\n';
 
     return 1;
   }
@@ -968,8 +968,9 @@ identify_column (const IWString & descriptor,
                  resizable_array<int> & columns_requested)
         
 {
-  if (! match_descriptor_names_as_regular_expressions)
+  if (! match_descriptor_names_as_regular_expressions) {
     return find_column_number(descriptor, header, columns_requested);
+  }
 
   re2::StringPiece tmp(descriptor.data(), descriptor.length());
   RE2 rx(tmp);
@@ -995,7 +996,7 @@ identify_column (const IWString & descriptor,
       rc++;
 
       if (verbose > 1)
-        cerr << "Descriptor '" << d << " in column " << (i + 1) << endl;
+        cerr << "Descriptor '" << d << " in column " << (i + 1) << '\n';
     }
   }
 
@@ -1014,7 +1015,7 @@ do_split(const const_IWSubstring & buffer,
 
   const char dquote = '"';
 
-//cerr << "Looking for wb in '" << buffer << endl;
+//cerr << "Looking for wb in '" << buffer << '\n';
 
   int previous_delimiter = -1;    // 
 
@@ -1087,7 +1088,7 @@ determine_descriptors_to_be_output(const const_IWSubstring & buffer,
   cerr << "header split into " << header.size() << " items\n";
   for (int i = 0; i < header.number_elements(); ++i)
   {
-    cerr << " col " << i << " dname " << *header[i] << endl;
+    cerr << " col " << i << " dname " << *header[i] << '\n';
   }
 #endif
 
@@ -1127,7 +1128,7 @@ determine_descriptors_to_be_output(const const_IWSubstring & buffer,
       continue;
     }
 
-    cerr << buffer << endl;
+    cerr << buffer << '\n';
     rc = 0;
   }
 
@@ -1188,7 +1189,7 @@ iwcut(iwstring_data_source & input,
 
     if (! iwcut(buffer, columns_requested, output))
     {
-      cerr << "Fatal error on line " << input.lines_read() << endl;
+      cerr << "Fatal error on line " << input.lines_read() << '\n';
       return 0;
     }
 
@@ -1380,7 +1381,7 @@ iwcut (int argc, char ** argv)
 //      {
 //        cerr << ' ' << (columns_requested[i] + 1);
 //      }
-//      cerr << endl;
+//      cerr << '\n';
 //    }
   }
 
@@ -1523,7 +1524,7 @@ iwcut (int argc, char ** argv)
     cerr << "Will extract these descriptors\n";
     for (int i = 0; i < descriptors_requested.number_elements(); i++)
     {
-      cerr << ' ' << *(descriptors_requested[i]) << endl;
+      cerr << ' ' << *(descriptors_requested[i]) << '\n';
     }
   }
 
@@ -1532,7 +1533,7 @@ iwcut (int argc, char ** argv)
     cerr << "Will extract these columns\n";
     for (int i = 0; i < columns_requested.number_elements(); i++)
     {
-      cerr << ' ' << (columns_requested[i] + 1) << endl;
+      cerr << ' ' << (columns_requested[i] + 1) << '\n';
     }
   }