From c7118b3a5cceb826de436f8c7064d9d8ed997c54 Mon Sep 17 00:00:00 2001 From: Sameer Srivastava Date: Thu, 4 Apr 2024 13:51:23 +0200 Subject: [PATCH] Add text_grep to indexer cargo workspace --- application/apps/indexer/Cargo.toml | 1 + .../apps/{ => indexer}/text_grep/.gitignore | 0 application/apps/indexer/text_grep/Cargo.lock | 727 ++++++++++++++++++ .../apps/{ => indexer}/text_grep/Cargo.toml | 5 +- application/apps/indexer/text_grep/README.md | 110 +++ .../{ => indexer}/text_grep/src/buffer.rs | 2 +- application/apps/indexer/text_grep/src/lib.rs | 268 +++++++ .../indexer/text_grep/tests/grep_tests.rs | 125 +++ .../text_grep/src/grep_searcher_no_chunks.rs | 196 ----- .../src/grep_searcher_with_chunks.rs | 233 ------ application/apps/text_grep/src/lib.rs | 164 ---- .../apps/text_grep/src/wc_impl2_draft.rs | 230 ------ .../apps/text_grep/tests/grep_tests.rs | 106 --- 13 files changed, 1234 insertions(+), 933 deletions(-) rename application/apps/{ => indexer}/text_grep/.gitignore (100%) create mode 100644 application/apps/indexer/text_grep/Cargo.lock rename application/apps/{ => indexer}/text_grep/Cargo.toml (87%) create mode 100644 application/apps/indexer/text_grep/README.md rename application/apps/{ => indexer}/text_grep/src/buffer.rs (99%) create mode 100644 application/apps/indexer/text_grep/src/lib.rs create mode 100644 application/apps/indexer/text_grep/tests/grep_tests.rs delete mode 100644 application/apps/text_grep/src/grep_searcher_no_chunks.rs delete mode 100644 application/apps/text_grep/src/grep_searcher_with_chunks.rs delete mode 100644 application/apps/text_grep/src/lib.rs delete mode 100644 application/apps/text_grep/src/wc_impl2_draft.rs delete mode 100644 application/apps/text_grep/tests/grep_tests.rs diff --git a/application/apps/indexer/Cargo.toml b/application/apps/indexer/Cargo.toml index 98a1d5bc9..4f4f16cde 100644 --- a/application/apps/indexer/Cargo.toml +++ b/application/apps/indexer/Cargo.toml @@ -11,6 +11,7 @@ members = [ "processor", 
"session", "sources", + "text_grep", ] # only uncomment when profiling diff --git a/application/apps/text_grep/.gitignore b/application/apps/indexer/text_grep/.gitignore similarity index 100% rename from application/apps/text_grep/.gitignore rename to application/apps/indexer/text_grep/.gitignore diff --git a/application/apps/indexer/text_grep/Cargo.lock b/application/apps/indexer/text_grep/Cargo.lock new file mode 100644 index 000000000..14d5226ad --- /dev/null +++ b/application/apps/indexer/text_grep/Cargo.lock @@ -0,0 +1,727 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" + +[[package]] +name = "bstr" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", + "slice-deque", +] + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + +[[package]] +name = "grep-matcher" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47a3141a10a43acfedc7c98a60a834d7ba00dfe7bec9071cbfc19b55b292ac02" +dependencies = [ + "memchr", +] + +[[package]] +name = "grep-regex" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f748bb135ca835da5cbc67ca0e6955f968db9c5df74ca4f56b18e1ddbc68230d" +dependencies = [ + "bstr", + "grep-matcher", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "grep-searcher" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba536ae4f69bec62d8839584dd3153d3028ef31bb229f04e09fb5a9e5a193c54" +dependencies = [ + "bstr", + "encoding_rs", + "encoding_rs_io", + "grep-matcher", + "log", + "memchr", + "memmap2", +] + +[[package]] +name = "hermit-abi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" + 
+[[package]] +name = "libc" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "mach" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86dd2487cdfea56def77b88438a2c915fb45113c5319bfe7e14306ca4cd0b0e1" +dependencies = [ + "libc", +] + +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + +[[package]] +name = "memmap2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +dependencies = [ + "libc", +] + +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] 
+name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "proc-macro2" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex" +version = 
"1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustix" +version = "0.38.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.196" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.196" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "slice-deque" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffddf594f5f597f63533d897427a570dbaa9feabaaa06595b74b71b7014507d7" +dependencies = [ + "libc", + "mach", + "winapi", +] + +[[package]] +name = "smallvec" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" + +[[package]] +name = "socket2" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "syn" +version = "2.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "text_grep" +version = "0.1.0" +dependencies = [ + "buf_redux", + "grep-regex", + "grep-searcher", + "regex", + "tempfile", + "thiserror", + "tokio", + "tokio-util", +] + +[[package]] +name = "thiserror" +version = "1.0.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-util" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + 
"winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" diff --git a/application/apps/text_grep/Cargo.toml b/application/apps/indexer/text_grep/Cargo.toml similarity index 87% rename from application/apps/text_grep/Cargo.toml rename to application/apps/indexer/text_grep/Cargo.toml index ed135e229..2dedbada9 100644 --- a/application/apps/text_grep/Cargo.toml +++ b/application/apps/indexer/text_grep/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [lib] name = "text_grep" crate-type = ["lib"] +path = "src/lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] @@ -13,9 +14,7 @@ tokio = { version = "1", features = ["full"] } buf_redux = "0.8.4" tokio-util = "0.7.10" tempfile = "3.10.0" -rayon = "1.8.1" -aho-corasick = "1.1.2" grep-searcher = "0.1.13" grep-regex = "0.1.12" thiserror = "1.0.57" -regex = "1.10.3" +regex = "1.10.3" \ No newline at end of file diff --git a/application/apps/indexer/text_grep/README.md b/application/apps/indexer/text_grep/README.md new file mode 100644 index 000000000..6a47fa483 --- /dev/null +++ b/application/apps/indexer/text_grep/README.md 
@@ -0,0 +1,110 @@ +# text_grep + +`text_grep` is a crate for searching text patterns within files using regular expressions. + +This crate provides functionality to search for multiple patterns within multiple files concurrently. +It supports both case-sensitive and case-insensitive search modes. + +## Examples + +```rust +use std::path::PathBuf; +use text_grep::count_occurrences; +use tokio_util::sync::CancellationToken; + +#[tokio::main] +async fn main() { + // Patterns to search for + let patterns = ["text", "administrator", "HTTP"]; + + // File paths to search within + let file_paths = [ + PathBuf::from("indexing_access_huge.log"), + PathBuf::from("indexing_access_huge.log"), + PathBuf::from("Cargo.toml"), + ]; + + // Create a cancellation token + let cancel_token = CancellationToken::new(); + + // Perform the search + match count_occurrences( + &patterns, + &file_paths.iter().collect::<Vec<_>>(), + true, + cancel_token, + ) + .await + { + Ok(results) => { + for result in results { + match result { + Ok(search_result) => { + // Process successful search result + println!("{:?}", search_result); + } + Err(err) => { + // Handle error + eprintln!("Error: {}", err); + } + } + } + } + Err(err) => { + // Handle error + eprintln!("Error: {}", err); + } + } +} +``` + +## Public Functions + +- `count_occurrences`: Asynchronously searches for multiple patterns within multiple files. + - Parameters: + - `patterns`: An array of string slices representing patterns to search for. + - `file_paths`: An array of `PathBuf` representing paths to files to search within. + - `case_sensitive`: A boolean indicating whether the search should be case-sensitive or not. + - `cancel_token`: A `CancellationToken` used for cancellation of the operation. + - Returns: + - `Result<Vec<Result<SearchResult, GrepError>>, GrepError>`: A vector of results containing either `SearchResult` or `GrepError`. + +## Error Handling + +- `GrepError` represents various errors that may occur during the search process. 
+ - `NotATextFile`: Indicates that a file is not a text file. + - `FileReadError`: Indicates an error occurred while reading a file. + - `FileProcessingError`: Indicates an error occurred while processing a file. + - `OperationCancelled`: Indicates that the operation was cancelled. + - `BuilingRegExError`: Indicates an error occurred while building a regular expression for searching. + - `RegExError`: Indicates an error occurred with a regular expression. + - `IOError`: Indicates an I/O error occurred. + +## Types + +- `SearchResult`: Represents the result of searching within a file. + - `file_path`: A `String` representing the path of the file. + - `pattern_counts`: A `HashMap<String, usize>` containing the counts of occurrences of each pattern within the file. + - `error_message`: An optional `String` containing an error message if any error occurred during the search. + +## Modules + +- `buffer`: Module for handling buffered I/O. +- `GrepError`: Enum (declared in the crate root, not a separate module) defining custom error types for the crate. + +## Dependencies + +- `buf_redux`: Provides buffered I/O functionality. +- `grep_regex`: Facilitates regular expression searching. +- `grep_searcher`: Implements file searching capabilities. +- `regex`: Provides regular expression support. +- `thiserror`: Simplifies error handling. + +## Additional Notes + +- This crate assumes that all files are text files. +- It utilizes asynchronous operations for efficiency, particularly in handling large files. +- Cancellation of ongoing operations is supported using a `CancellationToken`. +- Errors are handled using the `GrepError` enum, providing detailed error information. +- Regular expressions for searching are constructed dynamically based on user-provided patterns. +- Both case-sensitive and case-insensitive searches are supported based on user preference. 
diff --git a/application/apps/text_grep/src/buffer.rs b/application/apps/indexer/text_grep/src/buffer.rs similarity index 99% rename from application/apps/text_grep/src/buffer.rs rename to application/apps/indexer/text_grep/src/buffer.rs index 8352be1a0..e8930aca5 100644 --- a/application/apps/text_grep/src/buffer.rs +++ b/application/apps/indexer/text_grep/src/buffer.rs @@ -36,4 +36,4 @@ impl ReaderPolicy for CancallableMinBuffered { DoRead(true) } -} \ No newline at end of file +} diff --git a/application/apps/indexer/text_grep/src/lib.rs b/application/apps/indexer/text_grep/src/lib.rs new file mode 100644 index 000000000..68d9a0969 --- /dev/null +++ b/application/apps/indexer/text_grep/src/lib.rs @@ -0,0 +1,268 @@ +//! `text_grep` is a crate for searching text patterns within files using regular expressions. +//! +//! This crate provides functionality to search for multiple patterns within multiple files concurrently. +//! It supports both case-sensitive and case-insensitive search modes. +//! +//! # Examples +//! +//! ```rust +//! use std::path::PathBuf; +//! use text_grep::count_occurrences; +//! use tokio_util::sync::CancellationToken; +//! +//! #[tokio::main] +//! async fn main() { +//! // Patterns to search for +//! let patterns = ["text", "administrator", "HTTP"]; +//! +//! // File paths to search within +//! let file_paths = [ +//! PathBuf::from("indexing_access_huge.log"), +//! PathBuf::from("indexing_access_huge.log"), +//! PathBuf::from("Cargo.toml"), +//! ]; +//! +//! // Create a cancellation token +//! let cancel_token = CancellationToken::new(); +//! +//! // Perform the search +//! match count_occurrences( +//! &patterns, +//! &file_paths.iter().collect::>(), +//! true, +//! cancel_token, +//! ) +//! .await +//! { +//! Ok(results) => { +//! for result in results { +//! match result { +//! Ok(search_result) => { +//! // Process successful search result +//! println!("{:?}", search_result); +//! } +//! Err(err) => { +//! // Handle error +//! 
eprintln!("Error: {}", err); +//! } +//! } +//! } +//! } +//! Err(err) => { +//! // Handle error +//! eprintln!("Error: {}", err); +//! } +//! } +//! } +//! ``` +//! +//! # Public Functions +//! +//! - `count_occurrences`: Asynchronously searches for multiple patterns within multiple files. +//! - Parameters: +//! - `patterns`: An array of string slices representing patterns to search for. +//! - `file_paths`: An array of `PathBuf` representing paths to files to search within. +//! - `case_sensitive`: A boolean indicating whether the search should be case-sensitive or not. +//! - `cancel_token`: A `CancellationToken` used for cancellation of the operation. +//! - Returns: +//! - `Result<Vec<Result<SearchResult, GrepError>>, GrepError>`: A vector of results containing either `SearchResult` or `GrepError`. +//! +//! # Error Handling +//! +//! - `GrepError` represents various errors that may occur during the search process. +//! - `NotATextFile`: Indicates that a file is not a text file. +//! - `FileReadError`: Indicates an error occurred while reading a file. +//! - `FileProcessingError`: Indicates an error occurred while processing a file. +//! - `OperationCancelled`: Indicates that the operation was cancelled. +//! - `BuilingRegExError`: Indicates an error occurred while building a regular expression for searching. +//! - `RegExError`: Indicates an error occurred with a regular expression. +//! - `IOError`: Indicates an I/O error occurred. +//! +//! # Types +//! +//! - `SearchResult`: Represents the result of searching within a file. +//! - `file_path`: A `String` representing the path of the file. +//! - `pattern_counts`: A `HashMap<String, usize>` containing the counts of occurrences of each pattern within the file. +//! - `error_message`: An optional `String` containing an error message if any error occurred during the search. +//! +//! # Modules +//! +//! - `buffer`: Module for handling buffered I/O. +//! - `GrepError`: Enum (declared in the crate root, not a separate module) defining custom error types for the crate. +//! +//! # Dependencies +//! +//! 
- `buf_redux`: Provides buffered I/O functionality. +//! - `grep_regex`: Facilitates regular expression searching. +//! - `grep_searcher`: Implements file searching capabilities. +//! - `regex`: Provides regular expression support. +//! - `thiserror`: Simplifies error handling. +//! +//! # Additional Notes +//! +//! - This crate assumes that all files are text files. +//! - It utilizes asynchronous operations for efficiency, particularly in handling large files. +//! - Cancellation of ongoing operations is supported using a `CancellationToken`. +//! - Errors are handled using the `GrepError` enum, providing detailed error information. +//! - Regular expressions for searching are constructed dynamically based on user-provided patterns. +//! - Both case-sensitive and case-insensitive searches are supported based on user preference. + +pub mod buffer; +use crate::buffer::{CancallableMinBuffered, REDUX_MIN_BUFFER_SPACE, REDUX_READER_CAPACITY}; +use buf_redux::BufReader; +use grep_regex::{RegexMatcher, RegexMatcherBuilder}; +use grep_searcher::{sinks::UTF8, Searcher}; +use regex::Regex; +use std::{ + collections::HashMap, + fs::File, + io, + path::{Path, PathBuf}, +}; +use thiserror::Error; +use tokio_util::sync::CancellationToken; + +#[derive(Debug, Error, Clone)] +pub enum GrepError { + #[error("File '{0}' is not a text file")] + NotATextFile(String), + #[error("Error reading file: {0}")] + FileReadError(String), + #[error("Error processing file: {0}")] + FileProcessingError(String), + #[error("Operation cancelled")] + OperationCancelled, + #[error("Error building regex: {0}")] + BuilingRegExError(grep_regex::Error), + #[error("Error building regex: {0}")] + RegExError(regex::Error), + #[error("IO error: {0}")] + IOError(String), +} + +impl From for GrepError { + fn from(e: grep_regex::Error) -> Self { + Self::BuilingRegExError(e) + } +} + +impl From for GrepError { + fn from(e: regex::Error) -> Self { + Self::RegExError(e) + } +} + +impl From for GrepError { + fn 
from(e: io::Error) -> Self { + Self::IOError(e.to_string()) + } +} + +#[derive(Debug, Clone)] +pub struct SearchResult { + pub file_path: String, + pub pattern_counts: HashMap, + pub error_message: Option, +} + +fn get_matcher(patterns: &[&str], case_sensitive: bool) -> Result { + Ok(RegexMatcherBuilder::new() + .case_insensitive(!case_sensitive) + .build( + &patterns + .iter() + .map(|pattern| regex::escape(pattern)) + .collect::>() + .join("|"), + )?) +} + +fn get_patterns_as_regs(patterns: &[&str], case_sensitive: bool) -> Result, GrepError> { + let mut regs: Vec = Vec::new(); + for pattern in patterns.iter() { + let regex_builder = if case_sensitive { + Regex::new(pattern)? + } else { + Regex::new(&format!("(?i){}", regex::escape(pattern)))? + }; + regs.push(regex_builder); + } + Ok(regs) +} + +fn process_file( + file_path: &PathBuf, + matcher: &RegexMatcher, + patterns: &[Regex], + cancel_token: &CancellationToken, +) -> Result { + if !is_text_file(file_path) { + return Ok(SearchResult { + file_path: file_path.to_string_lossy().into_owned(), + pattern_counts: HashMap::new(), + error_message: Some(format!("File '{}' is not a text file", file_path.display())), + }); + } + let mut pattern_counts = HashMap::new(); + let file = File::open(file_path)?; + let reader = BufReader::with_capacity(REDUX_READER_CAPACITY, file).set_policy( + CancallableMinBuffered((REDUX_MIN_BUFFER_SPACE, cancel_token.clone())), + ); + let mut searcher = Searcher::new(); + searcher + .search_reader( + matcher, + reader, + UTF8(|_, line| { + for pattern in patterns { + let count_entry = pattern_counts.entry((*pattern).to_string()).or_insert(0); + *count_entry += pattern.captures_iter(line).count(); + } + Ok(true) + }), + ) + .map_err(|e| GrepError::FileProcessingError(format!("Error processing file: {}", e)))?; + + // Alter pattern_counts to have original patterns without (?i) + let mut altered_pattern_counts = HashMap::new(); + for (pattern, count) in &pattern_counts { + let 
original_pattern = unset_case_insensitivity_flag(pattern); + altered_pattern_counts.insert(original_pattern, *count); + } + + Ok(SearchResult { + file_path: file_path.to_string_lossy().into_owned(), + pattern_counts: altered_pattern_counts, + error_message: None, + }) +} + +fn unset_case_insensitivity_flag(pattern: &str) -> String { + if pattern.starts_with("(?i)") { + pattern.chars().skip(4).collect() + } else { + pattern.to_string() + } +} + +fn is_text_file(_file_path: &Path) -> bool { + true +} + +pub async fn count_occurrences( + patterns: &[&str], + file_paths: &[&PathBuf], + case_sensitive: bool, + cancel_token: CancellationToken, +) -> Result>, GrepError> { + let mut results = Vec::new(); + let matcher = get_matcher(patterns, case_sensitive)?; + let regs = get_patterns_as_regs(patterns, case_sensitive)?; + for file_path in file_paths { + if cancel_token.is_cancelled() { + return Err(GrepError::OperationCancelled); // Return early if cancellation requested + } + results.push(process_file(file_path, &matcher, ®s, &cancel_token)); + } + Ok(results) +} diff --git a/application/apps/indexer/text_grep/tests/grep_tests.rs b/application/apps/indexer/text_grep/tests/grep_tests.rs new file mode 100644 index 000000000..bf0b2e224 --- /dev/null +++ b/application/apps/indexer/text_grep/tests/grep_tests.rs @@ -0,0 +1,125 @@ +#[cfg(test)] +mod tests { + use std::fs::File; + use std::io::Write; + use std::path::PathBuf; + use tempfile::tempdir; + use text_grep::{count_occurrences, GrepError}; + use tokio_util::sync::CancellationToken; + + // Function to create a temporary test file with given content + fn create_temp_file(content: &str) -> (PathBuf, String) { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + let file_path = temp_dir.path().join("test_file.txt"); + let mut file = File::create(&file_path).expect("Failed to create temporary file"); + file.write_all(content.as_bytes()) + .expect("Failed to write to temporary file"); + ( + file_path, + 
temp_dir.into_path().to_string_lossy().to_string(), + ) + } + + #[tokio::test] + async fn test_positive_cases() { + let content = "This is a test file\n\ + with multiple lines\n\ + to testtest pattern matching"; + let (file_path, _) = create_temp_file(content); + + let patterns = vec!["test", "multiple", "pattern"]; + let cancel_token = CancellationToken::new(); + let result = count_occurrences(&patterns, &[&file_path], false, cancel_token.clone()).await; + + // Asserting the result + assert!(result.is_ok(), "Result is not Ok"); + let result = result.unwrap(); + assert_eq!(result.len(), 1); + let search_result = result.into_iter().next().unwrap(); + match search_result { + Ok(search_result) => { + assert_eq!(search_result.file_path, file_path.to_str().unwrap()); + assert_eq!(search_result.error_message, None); + assert_eq!(search_result.pattern_counts.get("test"), Some(&3)); + assert_eq!(search_result.pattern_counts.get("multiple"), Some(&1)); + assert_eq!(search_result.pattern_counts.get("pattern"), Some(&1)); + } + Err(err) => panic!("Error occurred: {:?}", err), + } + } + + #[tokio::test] + async fn test_negative_cases() { + let content = "This is a test file\n\ + with multiple lines\n\ + to test pattern matching"; + let (file_path, _) = create_temp_file(content); + + let patterns = vec!["nonexistent", "pattern"]; + let cancel_token = CancellationToken::new(); + let result = count_occurrences(&patterns, &[&file_path], false, cancel_token.clone()).await; + + // Asserting the result + assert!(result.is_ok(), "Result is not Ok"); + let result = result.unwrap(); + assert_eq!(result.len(), 1); + let search_result = result.into_iter().next().unwrap(); + match search_result { + Ok(search_result) => { + assert_eq!(search_result.file_path, file_path.to_str().unwrap()); + assert_eq!(search_result.error_message, None); + assert_eq!(search_result.pattern_counts.get("nonexistent"), Some(&0)); + assert_eq!(search_result.pattern_counts.get("pattern"), Some(&1)); + } + Err(err) 
=> panic!("Error occurred: {:?}", err), + } + } + + #[tokio::test] + async fn test_case_insensitivity() { + let content = "This is a test file\n\ + with multiple lines\n\ + to test pattern matching"; + let (file_path, _) = create_temp_file(content); + + let patterns = vec!["TEST", "MULTIPLE", "PATTERN"]; + let cancel_token = CancellationToken::new(); + let result = count_occurrences(&patterns, &[&file_path], false, cancel_token.clone()).await; + + // Asserting the result + assert!(result.is_ok(), "Result is not Ok"); + let result = result.unwrap(); + assert_eq!(result.len(), 1); + let search_result = result.into_iter().next().unwrap(); + match search_result { + Ok(search_result) => { + assert_eq!(search_result.file_path, file_path.to_str().unwrap()); + assert_eq!(search_result.error_message, None); + assert_eq!(search_result.pattern_counts.get("TEST"), Some(&2)); + assert_eq!(search_result.pattern_counts.get("MULTIPLE"), Some(&1)); + assert_eq!(search_result.pattern_counts.get("PATTERN"), Some(&1)); + } + Err(err) => panic!("Error occurred: {:?}", err), + } + } + + #[tokio::test] + async fn test_cancellation() { + let content = "This is a test file\n\ + with multiple lines\n\ + to test pattern matching"; + let (file_path, _) = create_temp_file(content); + + let patterns = vec!["test", "multiple", "pattern"]; + let cancel_token = CancellationToken::new(); + cancel_token.cancel(); + + let result = count_occurrences(&patterns, &[&file_path], false, cancel_token.clone()).await; + + // Asserting the result + assert!( + matches!(result, Err(GrepError::OperationCancelled)), + "Result is not Err(OperationCancelled)" + ); + } +} diff --git a/application/apps/text_grep/src/grep_searcher_no_chunks.rs b/application/apps/text_grep/src/grep_searcher_no_chunks.rs deleted file mode 100644 index 005b4db8b..000000000 --- a/application/apps/text_grep/src/grep_searcher_no_chunks.rs +++ /dev/null @@ -1,196 +0,0 @@ -use buf_redux::BufReader; -use grep_regex::RegexMatcherBuilder; -use 
grep_searcher::{sinks::UTF8, Searcher}; -use rayon::prelude::*; -use std::collections::HashMap; -use std::fs::File; -use std::path::{Path, PathBuf}; -use std::sync::mpsc::{self, Sender}; -use thiserror::Error; -use tokio_util::sync::CancellationToken; - -#[derive(Debug, Error, Clone)] -pub enum GrepError { - #[error("File '{0}' is not a text file")] - NotATextFile(String), - #[error("Error reading file: {0}")] - FileReadError(String), - #[error("Error processing file: {0}")] - FileProcessingError(String), - #[error("Operation cancelled")] - OperationCancelled, -} - -#[derive(Debug, Clone)] -pub struct SearchResult { - pub file_path: String, - pub pattern_counts: HashMap, - pub pattern_locations: HashMap>, - pub error_message: Option, -} - -pub struct TextGrep; - -impl Default for TextGrep { - fn default() -> Self { - TextGrep::new() - } -} - -impl TextGrep { - pub fn new() -> Self { - TextGrep - } - - pub async fn count_occurrences( - &self, - patterns: Vec<&str>, - file_paths: Vec<&str>, - case_sensitive: bool, - cancel_token: CancellationToken, - ) -> Result>, GrepError> { - let mut results = Vec::new(); - - let (sender, receiver) = mpsc::channel(); - let (error_sender, error_receiver) = mpsc::channel(); - - let _thread_handles: Vec<_> = file_paths - .par_iter() - .map(|&file_path| { - if let Err(err) = process_file( - file_path, - &patterns, - case_sensitive, - &cancel_token.clone(), - &sender, - ) { - if error_sender.send(err.clone()).is_err() {} - } - }) - .collect(); - - while let Ok(err_msg) = error_receiver.try_recv() { - results.push(Some(SearchResult { - file_path: "".to_string(), - pattern_counts: HashMap::new(), - pattern_locations: HashMap::new(), - error_message: Some(err_msg.to_string()), - })); - } - - while let Ok(search_result) = receiver.try_recv() { - results.push(Some(search_result?)); - } - - if cancel_token.is_cancelled() { - return Ok(vec![None; 0]); - } - - Ok(results) - } -} - -fn process_file( - file_path: &str, - patterns: &[&str], - 
case_sensitive: bool, - cancel_token: &CancellationToken, - sender: &Sender>, -) -> Result<(), GrepError> { - let file_path = PathBuf::from(&file_path); - - if !is_text_file(&file_path) { - let error_msg = format!("File '{}' is not a text file", file_path.display()); - let _ = sender - .send(Err(GrepError::NotATextFile(error_msg.clone()))) - .is_err(); - return Ok(()); - } - - let start_time = std::time::Instant::now(); - - let mut matchers = HashMap::new(); - for pattern in patterns { - let pattern_string = pattern.to_string(); - let mut matcher_builder = RegexMatcherBuilder::new(); - - // Adjust case sensitivity based on the parameter - if !case_sensitive { - matcher_builder.case_insensitive(true); - } - - let matcher = matcher_builder - .build(&pattern_string) - .map_err(|e| GrepError::FileProcessingError(e.to_string()))?; - matchers.insert(pattern_string, matcher); - } - - let mut local_pattern_counts = HashMap::new(); - let mut local_pattern_locations = HashMap::new(); // New: Track pattern locations - - for (pattern, matcher) in &matchers { - if cancel_token.is_cancelled() { - return Err(GrepError::OperationCancelled); - } - let mut total_count = 0; - let mut locations = Vec::new(); - let mut searcher = Searcher::new(); - - // Reset reader position for each pattern search - let file = File::open(&file_path).map_err(|e| GrepError::FileReadError(e.to_string()))?; - let reader = BufReader::new(file); // BufReader::with_capacity(64*1024*1024, file); // buf_redux's BufReader - - searcher - .search_reader( - matcher, - reader, - UTF8(|line_index, line| { - // line_number += 1; - // Convert both line and pattern to lowercase (or uppercase) for case-insensitive matching - let line_to_match = if case_sensitive { - line.to_string() - } else { - line.to_lowercase() - }; - let pattern_to_match = if case_sensitive { - pattern.to_string() - } else { - pattern.to_lowercase() - }; - total_count += line_to_match.matches(&pattern_to_match).count(); - - // Track positions of 
pattern occurrences - for mat in line_to_match.match_indices(&pattern_to_match) { - locations.push((line_index, mat.0)); // Record the position of the match - } - Ok(true) - }), - ) - .map_err(|e| GrepError::FileProcessingError(e.to_string()))?; - - local_pattern_counts.insert(pattern.clone(), total_count); - local_pattern_locations.insert(pattern.clone(), locations); // Save pattern locations - } - - let end_time = start_time.elapsed(); - eprintln!("Time taken {:?}", end_time); - - sender - .send(Ok(SearchResult { - file_path: file_path.to_string_lossy().into_owned(), - pattern_counts: local_pattern_counts, - pattern_locations: local_pattern_locations, - error_message: None, - })) - .map_err(|_| { - GrepError::FileProcessingError( - "Error sending search result through channel".to_string(), - ) - })?; - - Ok(()) -} - -fn is_text_file(_file_path: &Path) -> bool { - true -} diff --git a/application/apps/text_grep/src/grep_searcher_with_chunks.rs b/application/apps/text_grep/src/grep_searcher_with_chunks.rs deleted file mode 100644 index ff818e415..000000000 --- a/application/apps/text_grep/src/grep_searcher_with_chunks.rs +++ /dev/null @@ -1,233 +0,0 @@ -use grep_regex::RegexMatcherBuilder; -use grep_searcher::{sinks::UTF8, Searcher}; -use rayon::prelude::*; -use std::collections::HashMap; -use std::fs::File; -use std::io::{self, Read}; -use std::path::{Path, PathBuf}; -use std::sync::mpsc::{self, Sender}; -use thiserror::Error; -use tokio_util::sync::CancellationToken; - -#[derive(Debug, Error, Clone)] -pub enum GrepError { - #[error("File '{0}' is not a text file")] - NotATextFile(String), - #[error("Error reading file: {0}")] - FileReadError(String), - #[error("Error processing file: {0}")] - FileProcessingError(String), - #[error("Operation cancelled")] - OperationCancelled, -} - -#[derive(Debug, Clone)] -pub struct SearchResult { - pub file_path: String, - pub pattern_counts: HashMap, - pub error_message: Option, -} - -pub struct TextGrep; - -impl Default for 
TextGrep { - fn default() -> Self { - TextGrep::new() - } -} - -impl TextGrep { - pub fn new() -> Self { - TextGrep - } - - pub async fn count_occurrences( - &self, - patterns: Vec<&str>, - file_paths: Vec<&str>, - chunk_size: usize, - case_sensitive: bool, - cancel_token: CancellationToken, - ) -> Result>, GrepError> { - let mut results = Vec::new(); - - let (sender, receiver) = mpsc::channel(); - let (error_sender, error_receiver) = mpsc::channel(); - - let _thread_handles: Vec<_> = file_paths - .par_iter() - .map(|&file_path| { - if let Err(err) = process_file( - file_path, - &patterns, - chunk_size, - case_sensitive, - &cancel_token.clone(), - &sender, - ) { - if error_sender.send(err.clone()).is_err() {} - } - }) - .collect(); - - while let Ok(err_msg) = error_receiver.try_recv() { - results.push(Some(SearchResult { - file_path: "".to_string(), - pattern_counts: HashMap::new(), - error_message: Some(err_msg.to_string()), - })); - } - - while let Ok(search_result) = receiver.try_recv() { - results.push(Some(search_result?)); - } - - if cancel_token.is_cancelled() { - return Ok(vec![None; 0]); - } - - Ok(results) - } -} - -fn is_text_file(_file_path: &Path) -> bool { - true -} - -fn process_file( - file_path: &str, - patterns: &[&str], - chunk_size: usize, - case_sensitive: bool, - cancel_token: &CancellationToken, - sender: &Sender>, -) -> Result<(), GrepError> { - let file_path = PathBuf::from(&file_path); - - if !is_text_file(&file_path) { - let error_msg = format!("File '{}' is not a text file", file_path.display()); - let _ = sender - .send(Err(GrepError::NotATextFile(error_msg.clone()))) - .is_err(); - return Ok(()); - } - - let start_time = std::time::Instant::now(); - let pattern_counts = HashMap::new(); - - let mut file = - std::fs::File::open(&file_path).map_err(|e| GrepError::FileReadError(e.to_string()))?; - let mut buffer = vec![0; chunk_size]; // define a buffer to read chunks of data - - // create matchers for each pattern and store them with 
their corresponding patterns in a hashmap - let mut matchers = HashMap::new(); - - for pattern in patterns { - let pattern_string = pattern.to_string(); - let mut matcher_builder = RegexMatcherBuilder::new(); - - // Adjust case sensitivity based on the parameter - if !case_sensitive { - matcher_builder.case_insensitive(true); - } - - let matcher = matcher_builder - .build(&pattern_string) - .map_err(|e| GrepError::FileProcessingError(e.to_string()))?; - matchers.insert(pattern_string, matcher); - } - - let mut pattern_counts_clone = pattern_counts.clone(); - - loop { - if cancel_token.is_cancelled() { - return Err(GrepError::OperationCancelled); - } - - let bytes_read = read_file_until_newline(&mut file, &mut buffer, chunk_size) - .map_err(|e| GrepError::FileReadError(e.to_string()))?; - if bytes_read == 0 { - break; // Reached EOF - } - - let mut local_pattern_counts = HashMap::new(); - - for (pattern, matcher) in &matchers { - let mut total_count = 0; - - let mut searcher = Searcher::new(); - searcher - .search_reader( - matcher, - &buffer[..bytes_read], - UTF8(|_, line| { - // Convert both line and pattern to lowercase (or uppercase) for case-insensitive matching - let line_to_match = if case_sensitive { - line.to_string() - } else { - line.to_lowercase() - }; - let pattern_to_match = if case_sensitive { - pattern.to_string() - } else { - pattern.to_lowercase() - }; - total_count += line_to_match.matches(&pattern_to_match).count(); - Ok(true) - }), - ) - .map_err(|e| GrepError::FileProcessingError(e.to_string()))?; - - local_pattern_counts.insert(pattern.clone(), total_count); - } - - for (pattern, count) in local_pattern_counts { - *pattern_counts_clone.entry(pattern).or_insert(0) += count; - } - } - - let end_time = start_time.elapsed(); - eprintln!("Time taken {:?}", end_time); - - sender - .send(Ok(SearchResult { - file_path: file_path.to_string_lossy().into_owned(), - pattern_counts: pattern_counts_clone, - error_message: None, - })) - .map_err(|_| { - 
GrepError::FileProcessingError( - "Error sending search result through channel".to_string(), - ) - })?; - - Ok(()) -} - -fn read_file_until_newline( - file: &mut File, - buffer: &mut Vec, - chunk_size: usize, -) -> io::Result { - let mut bytes_read = 0; - - loop { - // Read a chunk of data into the buffer - let chunk = file.read(&mut buffer[bytes_read..bytes_read + chunk_size])?; - - // Update the total bytes read - bytes_read += chunk; - - // If we've reached the end of the file or encountered a newline, break - if chunk == 0 || buffer[bytes_read - 1] == b'\n' { - break; - } - - // If buffer is full, extend it to accommodate more data - if bytes_read + chunk_size > buffer.len() { - buffer.resize(bytes_read + chunk_size, 0); - } - } - - Ok(bytes_read) -} \ No newline at end of file diff --git a/application/apps/text_grep/src/lib.rs b/application/apps/text_grep/src/lib.rs deleted file mode 100644 index ae4d7448e..000000000 --- a/application/apps/text_grep/src/lib.rs +++ /dev/null @@ -1,164 +0,0 @@ -pub mod buffer; -use crate::buffer::{CancallableMinBuffered, REDUX_MIN_BUFFER_SPACE, REDUX_READER_CAPACITY}; -use buf_redux::BufReader; -use grep_regex::RegexMatcherBuilder; -use grep_searcher::{sinks::UTF8, Searcher}; -use std::collections::HashMap; -use std::fs::File; -use std::path::{Path, PathBuf}; -use thiserror::Error; -use tokio_util::sync::CancellationToken; - -#[derive(Debug, Error, Clone)] -pub enum GrepError { - #[error("File '{0}' is not a text file")] - NotATextFile(String), - #[error("Error reading file: {0}")] - FileReadError(String), - #[error("Error processing file: {0}")] - FileProcessingError(String), - #[error("Operation cancelled")] - OperationCancelled, -} - -#[derive(Debug, Clone)] -pub struct SearchResult { - pub file_path: String, - pub pattern_counts: HashMap, - pub error_message: Option, -} - -pub struct TextGrep; - -impl Default for TextGrep { - fn default() -> Self { - TextGrep::new() - } -} - -impl TextGrep { - pub fn new() -> Self { - 
TextGrep - } - - pub async fn count_occurrences( - &self, - patterns: Vec<&str>, - file_paths: Vec<&str>, - case_sensitive: bool, - cancel_token: CancellationToken, - ) -> Result>, GrepError> { - let mut results = Vec::new(); - - for file_path in file_paths { - if cancel_token.is_cancelled() { - return Err(GrepError::OperationCancelled); // Return early if cancellation requested - } - match process_file(file_path, &patterns, case_sensitive, &cancel_token) { - Ok(result) => results.push(Ok(result)), - Err(err) => results.push(Err(err)), - } - } - - Ok(results) - } -} - -fn process_file( - file_path: &str, - patterns: &[&str], - case_sensitive: bool, - cancel_token: &CancellationToken, -) -> Result { - let file_path = PathBuf::from(&file_path); - - if !is_text_file(&file_path) { - let error_msg = format!("File '{}' is not a text file", file_path.display()); - return Ok(SearchResult { - file_path: file_path.to_string_lossy().into_owned(), - pattern_counts: HashMap::new(), - error_message: Some(error_msg), - }); - } - - let start_time = std::time::Instant::now(); - - let combined_regex_pattern = patterns.join("|"); - let combined_regex = match RegexMatcherBuilder::new().build(&combined_regex_pattern) { - Ok(regex) => regex, - Err(e) => { - return Err(GrepError::FileProcessingError(format!( - "Error building regex: {}", - e - ))); - } - }; - - let mut local_pattern_counts = HashMap::new(); - - match File::open(&file_path) { - Ok(file) => { - let reader = BufReader::with_capacity(REDUX_READER_CAPACITY, file).set_policy( - CancallableMinBuffered((REDUX_MIN_BUFFER_SPACE, cancel_token.clone())), - ); - - let mut searcher = Searcher::new(); - if let Err(e) = searcher.search_reader( - &combined_regex, - reader, - UTF8(|_, line| { - let line_to_match = if case_sensitive { - line.to_string() - } else { - line.to_lowercase() - }; - - for pattern in patterns { - if cancel_token.is_cancelled() { - return Ok(false); - } - - let pattern_to_match = if case_sensitive { - 
pattern.to_string() - } else { - pattern.to_lowercase() - }; - - let mut total_count = 0; - total_count += line_to_match.matches(&pattern_to_match).count(); - - let count_entry = local_pattern_counts - .entry((*pattern).to_string()) - .or_insert(0); - *count_entry += total_count; - } - Ok(true) - }), - ) { - return Err(GrepError::FileProcessingError(format!( - "Error processing file: {}", - e - ))); - } - } - Err(e) => { - return Err(GrepError::FileReadError(format!( - "Error reading file: {}", - e - ))); - } - } - - let end_time = start_time.elapsed(); - eprintln!("Time taken {:?}", end_time); - - Ok(SearchResult { - file_path: file_path.to_string_lossy().into_owned(), - pattern_counts: local_pattern_counts, - error_message: None, - }) -} - -fn is_text_file(_file_path: &Path) -> bool { - true -} diff --git a/application/apps/text_grep/src/wc_impl2_draft.rs b/application/apps/text_grep/src/wc_impl2_draft.rs deleted file mode 100644 index cbe7ceeed..000000000 --- a/application/apps/text_grep/src/wc_impl2_draft.rs +++ /dev/null @@ -1,230 +0,0 @@ -/* -This is just a sample implementation and needs a fair bit of changes. -Adding it here only for an overview. -Still need to remove threads, mutex, arc, etc. -This is just a draft proof of concept and has enough room for improvement -and requires implementation of buf_redux. -This is mainly a proof of concept for read_file_until_newline to stop splitting of -pattern across chunks and calculate count of all occurrences of a pattern instead of -just the count of lines where pattern is found. 
-*/ - -use grep_regex::RegexMatcher; -use grep_searcher::sinks::UTF8; -use grep_searcher::Searcher; -use rayon::prelude::*; -use std::collections::HashMap; -use std::path::{Path, PathBuf}; -use std::sync::mpsc::Sender; -use std::sync::{mpsc, Arc, Mutex}; -use std::thread; -use std::time::Instant; -use tokio_util::sync::CancellationToken; -use std::fs::File; -use std::io::{self, BufReader, Read}; - -#[derive(Debug)] -pub struct SearchResult { - pub file_path: String, - pub pattern_counts: HashMap, - pub error_message: Option, -} - -pub struct TextGrep; - -impl Default for TextGrep { - fn default() -> Self { - TextGrep::new() - } -} - -impl TextGrep { - pub fn new() -> Self { - TextGrep - } - - pub async fn count_occurrences( - &self, - patterns: Vec<&str>, - file_paths: Vec<&str>, - chunk_size: usize, - case_sensitive: bool, - cancel_token: CancellationToken, - ) -> Result, String> { - let mut results = Vec::new(); - let cancel_token_clone = cancel_token.clone(); - - let (sender, receiver) = mpsc::channel(); - let (error_sender, error_receiver) = mpsc::channel(); - - let patterns_arc: Vec<_> = patterns.iter().map(|&p| Arc::from(p)).collect(); - let file_paths_arc: Vec<_> = file_paths.iter().map(|&fp| Arc::from(fp)).collect(); - - let thread_handles: Vec<_> = file_paths_arc - .par_iter() - .map(|file_path| { - let patterns = patterns_arc.clone(); - let sender = sender.clone(); - let error_sender_clone = error_sender.clone(); - let cancel_token = cancel_token_clone.clone(); - let file_path = Arc::clone(file_path); - if let Err(err) = - process_file(&file_path, &patterns, chunk_size, case_sensitive, &cancel_token, &sender) - { - if error_sender_clone.send(err.to_string()).is_err() { - eprintln!("Error sending error message through channel"); - } - } - }) - .collect(); - - // for handle in thread_handles { - // handle.join().unwrap(); - // } - - while let Ok(err_msg) = error_receiver.try_recv() { - eprintln!("Error processing file: {:?}", err_msg); - 
results.push(SearchResult { - file_path: "".to_string(), - pattern_counts: HashMap::new(), - error_message: Some(err_msg.to_string()), - }); - } - - while let Ok(search_result) = receiver.try_recv() { - results.push(search_result?); - } - - Ok(results) - } -} - -fn process_file( - file_path: &Arc, - patterns: &[Arc], - chunk_size: usize, - case_sensitive: bool, - cancel_token: &CancellationToken, - sender: &Sender>, -) -> Result<(), String> { - let file_path = PathBuf::from(&**file_path); - - if !is_text_file(&file_path) { - let error_msg = format!("File '{}' is not a text file", file_path.display()); - if sender.send(Err(error_msg.clone())).is_err() { - eprintln!("Error sending search result through channel"); - } - return Ok(()); - } - - let start_time = Instant::now(); - let pattern_counts = HashMap::new(); - - let mut file = std::fs::File::open(&file_path).map_err(|e| e.to_string())?; - let mut buffer = vec![0; chunk_size]; // define a buffer to read chunks of data - - // create matchers for each pattern and store them with their corresponding patterns in a hashmap - let mut matchers = HashMap::new(); - for pattern in patterns { - let pattern_string = pattern.as_ref().to_string(); - let matcher = RegexMatcher::new(&pattern_string).map_err(|e| e.to_string())?; - matchers.insert(pattern_string, matcher); - } - - let pattern_counts_mutex = Arc::new(Mutex::new(pattern_counts)); - - let mut threads = vec![]; - - loop { - if cancel_token.is_cancelled() { - return Err("Operation cancelled".to_string()); - } - - let bytes_read = read_file_until_newline(&mut file, &mut buffer, chunk_size).map_err(|e| e.to_string())?; - if bytes_read == 0 { - break; // Reached EOF - } - - let matchers_clone = matchers.clone(); - let pattern_counts_mutex_clone = pattern_counts_mutex.clone(); - let buffer_clone = buffer.clone(); - - let thread_handle = thread::spawn(move || { - let mut local_pattern_counts = HashMap::new(); - - for (pattern, matcher) in &matchers_clone { - let mut 
total_count = 0; - let mut searcher = Searcher::new(); - - searcher - .search_reader( - matcher, - &buffer_clone[..bytes_read], - UTF8(|_, line| { - total_count += line.matches(pattern).count(); - Ok(true) - }), - ) - .map_err(|e| e.to_string()) - .unwrap(); - - local_pattern_counts.insert(pattern.clone(), total_count); - } - - let mut pattern_counts_mutex_guard = pattern_counts_mutex_clone.lock().unwrap(); - for (pattern, count) in local_pattern_counts { - *pattern_counts_mutex_guard.entry(pattern).or_insert(0) += count; - } - }); - - threads.push(thread_handle); - } - - for thread in threads { - thread.join().unwrap(); - } - - let end_time = start_time.elapsed(); - eprintln!("Time taken {:?}", end_time); - - let pattern_counts_mutex_guard = pattern_counts_mutex.lock().unwrap(); - let aggregated_pattern_counts = pattern_counts_mutex_guard.clone(); - - sender - .send(Ok(SearchResult { - file_path: file_path.to_string_lossy().into_owned(), - pattern_counts: aggregated_pattern_counts, - error_message: None, - })) - .map_err(|e| e.to_string())?; - - Ok(()) -} - -fn read_file_until_newline(file: &mut File, buffer: &mut Vec, chunk_size: usize) -> io::Result { - let mut bytes_read = 0; - - loop { - // Read a chunk of data into the buffer - let chunk = file.read(&mut buffer[bytes_read..bytes_read + chunk_size])?; - - // Update the total bytes read - bytes_read += chunk; - - // If we've reached the end of the file or encountered a newline, break - if chunk == 0 || buffer[bytes_read - 1] == b'\n' { - break; - } - - // If buffer is full, extend it to accommodate more data - if bytes_read + chunk_size > buffer.len() { - buffer.resize(bytes_read + chunk_size, 0); - } - } - - Ok(bytes_read) -} - -fn is_text_file(_file_path: &Path) -> bool { - true -} diff --git a/application/apps/text_grep/tests/grep_tests.rs b/application/apps/text_grep/tests/grep_tests.rs deleted file mode 100644 index f536f56c2..000000000 --- a/application/apps/text_grep/tests/grep_tests.rs +++ /dev/null @@ 
-1,106 +0,0 @@ -#[cfg(test)] -mod tests { - use std::fs::File; - use std::io::Write; - use std::path::PathBuf; - use tempfile::tempdir; - use text_grep::GrepError; - use text_grep::TextGrep; - use tokio; - use tokio::runtime::Runtime; - use tokio_util::sync::CancellationToken; - - // function to create a temporary test file with given content - fn create_temp_file(content: &str) -> (PathBuf, String) { - let temp_dir = tempdir().expect("Failed to create temporary directory"); - let file_path = temp_dir.path().join("test_file.txt"); - let mut file = File::create(&file_path).expect("Failed to create temporary file"); - file.write_all(content.as_bytes()) - .expect("Failed to write to temporary file"); - ( - file_path, - temp_dir.into_path().to_string_lossy().to_string(), - ) - } - - #[test] - fn test_positive_cases() { - let content = "This is a test file\n\ - with multiple lines\n\ - to testtest pattern matching"; - let (file_path, _) = create_temp_file(content); - - let patterns = vec!["test", "multiple", "pattern"]; - let grep = TextGrep::new(); - let runtime = Runtime::new().unwrap(); - let result = runtime.block_on(grep.count_occurrences( - patterns.iter().map(|&p| p).collect(), - vec![file_path.to_str().unwrap()], - false, - CancellationToken::new(), - )); - - // Asserting the result - assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!(result.len(), 1); - let search_result = result.into_iter().next().unwrap().unwrap(); - assert_eq!(search_result.file_path, file_path.to_str().unwrap()); - assert_eq!(search_result.error_message, None); - assert_eq!(search_result.pattern_counts.get("test"), Some(&3)); - assert_eq!(search_result.pattern_counts.get("multiple"), Some(&1)); - assert_eq!(search_result.pattern_counts.get("pattern"), Some(&1)); - } - - #[test] - fn test_negative_cases() { - let content = "This is a test file\n\ - with multiple lines\n\ - to test pattern matching"; - let (file_path, _) = create_temp_file(content); - - let patterns = 
vec!["nonexistent", "pattern"]; - let grep = TextGrep::new(); - let runtime = Runtime::new().unwrap(); - let result = runtime.block_on(grep.count_occurrences( - patterns.iter().map(|&p| p).collect(), - vec![file_path.to_str().unwrap()], - false, - CancellationToken::new(), - )); - - // Asserting the result - assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!(result.len(), 1); - let search_result = result.into_iter().next().unwrap().unwrap(); - assert_eq!(search_result.file_path, file_path.to_str().unwrap()); - assert_eq!(search_result.error_message, None); - assert_eq!(search_result.pattern_counts.get("nonexistent"), Some(&0)); - assert_eq!(search_result.pattern_counts.get("pattern"), Some(&1)); - } - - #[test] - fn test_cancellation() { - let content = "This is a test file\n\ - with multiple lines\n\ - to test pattern matching"; - let (file_path, _) = create_temp_file(content); - - let patterns = vec!["test", "multiple", "pattern"]; - let grep = TextGrep::new(); - let runtime = Runtime::new().unwrap(); - let cancel_token = CancellationToken::new(); - cancel_token.cancel(); - - let result = runtime.block_on(grep.count_occurrences( - patterns.iter().map(|&p| p).collect(), - vec![file_path.to_str().unwrap()], - false, - cancel_token, - )); - - // Asserting the result - assert!(matches!(result, Err(GrepError::OperationCancelled))); - } -}