Skip to content

Commit 3f258f2

Browse files
Reorganize crates, document a bunch of em
1 parent d855898 commit 3f258f2

21 files changed

+618
-275
lines changed

Cargo.lock

+54-162
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
[workspace]
22
members = [
3+
"crates/sacabase",
34
"crates/divsufsort",
45
"crates/cdivsufsort",
56
"crates/divsuftest",

README.md

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
2+
# suffixsearch
3+
4+
[![Build Status](https://travis-ci.org/fasterthanlime/suffixsearch.svg?branch=master)](https://travis-ci.org/fasterthanlime/suffixsearch)
5+
6+
A collection of SACAs (suffix array construction algorithms) and other
7+
methods of indexing and searching for substrings in all suffixes of a
8+
given input.
9+
10+
## Crates
11+
12+
* [divsufsort](crates/divsufsort) is a Rust version of Yuta Mori's `libdivsufsort`, ported by hand
13+
* [cdivsufsort](crates/cdivsufsort) is Yuta Mori's original `libdivsufsort`, built with the `cc` crate
14+
* [divsuftest](crates/divsuftest) is a test executable that allows comparing against the
15+
above crates.
16+
* [dc3](crates/dc3) is a naive work-in-progress implementation of DC3 (Differential Cover, v=3)
17+
18+
See the crates' README files for more information on their status,
19+
expected performance and licensing.

crates/cdivsufsort/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ edition = "2018"
88
crosscheck = []
99

1010
[dependencies]
11+
sacabase = { path = "../sacabase" }
1112

1213
[build-dependencies]
1314
cc = "1.0.47"

crates/cdivsufsort/LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2019 Amos Wenger All rights reserved.
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

crates/cdivsufsort/README.md

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
2+
# cdivsufsort
3+
4+
This crate contains Yuta Mori's C codebase `libdivsufsort`, as found on:
5+
6+
* <https://github.com/y-256/libdivsufsort/tree/5f60d6f026c30fb4ac296f696b3c8b0eb71bd428>
7+
8+
...and a minimal Rust interface to it.
9+
10+
## Changes
11+
12+
There are no functional changes to the codebase, however:
13+
14+
* Parts of the code have been formatted with clang-format (LLVM style)
15+
* Many of the loops (for, do..while) have been given names in comments, for
16+
ease of translation.
17+
* The codebase contains "cross-checking" facilities (the macros `crosscheck`,
18+
`SA_dump`, etc.) so its behavior can be compared with the Rust port.
19+
20+
Cross-checking is only built when the `crosscheck` feature is enabled. It is
21+
not intended for general use, only for debugging the `divsufsort` crate.
22+
23+
## Further reading
24+
25+
The divsufsort algorithm is based on "",
26+
27+
## Authors
28+
29+
The original code was written by Yuta Mori, and its essence is not changed
30+
here.
31+
32+
## License
33+
34+
`cdivsufsort` is released under the MIT license, same as the original.
35+
36+
See the `LICENSE` and `c-sources/LICENSE` files for details.
37+

crates/cdivsufsort/c-sources/LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2003 Yuta Mori All rights reserved.
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

crates/cdivsufsort/src/lib.rs

+27-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,30 @@
11
extern "C" {
2-
pub fn divsufsort(T: *const u8, SA: *mut i32, n: i32) -> i32;
2+
fn divsufsort(T: *const u8, SA: *mut i32, n: i32) -> i32;
33
pub fn dss_flush();
44
}
5+
6+
/// Sort suffixes of `text` and store their lexicographic order
7+
/// in the given suffix array `sa`.
8+
/// Will panic if `sa.len()` != `text.len()`
9+
pub fn sort_in_place(text: &[u8], sa: &mut [i32]) {
10+
assert_eq!(
11+
text.len(),
12+
sa.len(),
13+
"text and suffix array should have same len"
14+
);
15+
assert!(
16+
text.len() < i32::max_value() as usize,
17+
"text too large, should not exceed {} bytes",
18+
i32::max_value() - 1
19+
);
20+
21+
let ret = unsafe { divsufsort(text.as_ptr(), sa.as_mut_ptr(), text.len() as i32) };
22+
assert_eq!(0, ret);
23+
}
24+
25+
/// Sort suffixes of `text` and return the resulting suffix array.
26+
pub fn sort<'a>(text: &'a [u8]) -> sacabase::SuffixArray<i32> {
27+
let mut sa = vec![0; text.len()];
28+
sort_in_place(text, &mut sa);
29+
sacabase::SuffixArray::new(text, sa)
30+
}

crates/dc3/README.md

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
# dc3
3+
4+
An incomplete, exploratory implementation of DC-3.

crates/divsufsort/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ crosscheck = ["once_cell"]
99

1010
[dependencies]
1111
once_cell = { version = "1.2.0", optional = true }
12+
sacabase = { path = "../sacabase" }

crates/divsufsort/LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2019 Amos Wenger All rights reserved.
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

crates/divsufsort/README.md

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
2+
# divsufsort
3+
4+
This crate contains a handmade Rust port of Yuta Mori's `libdivsufsort`, as found on:
5+
6+
* <https://github.com/y-256/libdivsufsort/tree/5f60d6f026c30fb4ac296f696b3c8b0eb71bd428>
7+
8+
## Changes
9+
10+
The main changes from the C codebase are as follows.
11+
12+
Instead of passing pointers to T (the original text) and SA (the suffix array),
13+
slices and indices are passed. This sometimes involves adding more parameters
14+
to functions (like `tr_heapsort`).
15+
16+
Some macros (for stacks, used in `sssort` and `trsort`) have been replaced with
17+
proper Rust types. The `SAPtr` type is used to represent an index into `SA`.
18+
A/B/B* access has also been translated from C macros to Rust (inlined) functions.
19+
20+
Cross-checking is only built when the `crosscheck` feature is enabled. It is
21+
not intended for general use, only for debugging the `divsufsort` crate.
22+
23+
## Authors
24+
25+
The original C code was written by Yuta Mori.
26+
27+
The port was done by hand, by [Amos Wenger](https://github.com/fasterthanlime).
28+
29+
## License
30+
31+
`divsufsort` is released under the MIT license, same as the original.
32+
33+
See the `LICENSE` file for details.
Binary file not shown.

crates/divsufsort/src/common.rs

-2
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@ pub const ALPHABET_SIZE: usize = u8::max_value() as usize + 1;
1313
pub const BUCKET_A_SIZE: usize = ALPHABET_SIZE;
1414
pub const BUCKET_B_SIZE: usize = ALPHABET_SIZE * ALPHABET_SIZE;
1515

16-
pub const MAX_INPUT_SIZE: usize = i32::max_value() as usize;
17-
1816
// Read-only input to suffix-sort
1917
pub struct Text<'a>(pub &'a [Char]);
2018

crates/divsufsort/src/divsufsort.rs

+10-10
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
use crate::{common::*, crosscheck, crosscheck::*, sssort, trsort};
22

33
pub fn divsufsort(T: &[Char], SA: &mut [Idx]) {
4-
if T.len() != SA.len() {
5-
panic!("divsufsort: T and SA arguments should have same length");
6-
}
4+
assert_eq!(
5+
T.len(),
6+
SA.len(),
7+
"text and suffix array should have same len"
8+
);
9+
assert!(
10+
T.len() < i32::max_value() as usize,
11+
"text too large, should not exceed {} bytes",
12+
i32::max_value() - 1
13+
);
714

815
let n = T.len();
9-
if n >= MAX_INPUT_SIZE {
10-
// This one ought to not be a panic, maybe?
11-
panic!(
12-
"divsufsort: input too large (max input size: {})",
13-
MAX_INPUT_SIZE
14-
)
15-
}
1616

1717
// short T cases
1818
match n {

crates/divsufsort/src/lib.rs

+17-5
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,22 @@ mod divsufsort;
1111
mod sssort;
1212
mod trsort;
1313

14-
pub use crate::divsufsort::divsufsort;
15-
pub use common::Idx;
14+
use common::Idx;
15+
use sacabase::SuffixArray;
16+
17+
/// Sort suffixes of `text` and store their lexicographic order
18+
/// in the given suffix array `sa`.
19+
/// Will panic if `sa.len()` != `text.len()`
20+
pub fn sort_in_place(text: &[u8], sa: &mut [Idx]) {
21+
divsufsort::divsufsort(text, sa);
22+
}
23+
24+
/// Sort suffixes of `text` and return the resulting suffix array.
25+
pub fn sort<'a>(text: &'a [u8]) -> sacabase::SuffixArray<Idx> {
26+
let mut sa = vec![0; text.len()];
27+
sort_in_place(text, &mut sa);
28+
sacabase::SuffixArray::new(text, sa)
29+
}
1630

1731
#[cfg(test)]
1832
mod tests {
@@ -30,8 +44,6 @@ mod tests {
3044
where
3145
T: AsRef<[u8]>,
3246
{
33-
let s = s.as_ref();
34-
let mut SA = vec![0; s.len()];
35-
super::divsufsort(s, &mut SA[..]);
47+
super::sort(s.as_ref());
3648
}
3749
}

crates/divsuftest/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ cdivsufsort = { path = "../cdivsufsort" }
1313
suffix_array = "0.4.0"
1414
better-panic = "0.2.0"
1515
size_format = "1.0.2"
16-
wavelet-matrix = { path = "./wavelet-matrix-rs" }
16+
cli-table = "0.2.0"

0 commit comments

Comments
 (0)