Skip to content

Commit 3f80beb

Browse files
more cross-checking
1 parent c2c81bb commit 3f80beb

File tree

5 files changed

+53
-8
lines changed

5 files changed

+53
-8
lines changed

original/divsufsort.c

+1
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ sort_typeBstar(const sauchar_t *T, saidx_t *SA,
182182
BSTAR_dump("post-rank");
183183

184184
/* Construct the inverse suffix array of type B* suffixes using trsort. */
185+
SA_dump(SA, "pre-tr");
185186
trsort(ISAb, SA, m, 1);
186187

187188
SA_dump(SA, "post-tr");

original/trsort.c

+31-6
Original file line numberDiff line numberDiff line change
@@ -690,23 +690,48 @@ trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth) {
690690
first = SA;
691691
skip = 0;
692692
unsorted = 0;
693+
// PETER
693694
do {
694-
if((t = *first) < 0) { first -= t; skip += t; }
695+
if ((t = *first) < 0) {
696+
crosscheck("t < 0");
697+
first -= t;
698+
skip += t;
699+
crosscheck("first=%d skip=%d", first-SA, skip);
700+
}
695701
else {
696-
if(skip != 0) { *(first + skip) = skip; skip = 0; }
702+
crosscheck("t >= 0");
703+
if (skip != 0) {
704+
crosscheck("SA[first=%d + skip=%d] = skip=%d", first-SA, skip, skip);
705+
*(first + skip) = skip;
706+
skip = 0;
707+
}
697708
last = SA + ISA[t] + 1;
709+
crosscheck("last=%d", last-SA);
698710
if(1 < (last - first)) {
711+
crosscheck("1<(last-first)");
699712
budget.count = 0;
713+
SA_dump(SA, "tr_introsort(A)")
714+
crosscheck("call tr_introsort ISA=%d ISAd=%d first=%d last=%d", ISA-SA, ISAd-SA, first-SA, last-SA);
700715
tr_introsort(ISA, ISAd, SA, first, last, &budget);
701-
if(budget.count != 0) { unsorted += budget.count; }
702-
else { skip = first - last; }
716+
SA_dump(SA, "tr_introsort(B)")
717+
if (budget.count != 0) {
718+
unsorted += budget.count;
719+
} else {
720+
skip = first - last;
721+
}
703722
} else if((last - first) == 1) {
704723
skip = -1;
705724
}
706725
first = last;
707726
}
708727
} while(first < (SA + n));
709-
if(skip != 0) { *(first + skip) = skip; }
710-
if(unsorted == 0) { break; }
728+
if(skip != 0) {
729+
crosscheck("skip != 0, trsort-end");
730+
*(first + skip) = skip;
731+
}
732+
if(unsorted == 0) {
733+
crosscheck("unsorted == 0, trsort-end");
734+
break;
735+
}
711736
}
712737
}

src/divsufsort.rs

+1
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ fn sort_typeBstar(T: &Text, SA: &mut SuffixArray) -> SortTypeBstarResult {
275275
BSTAR_dump(&mut B, "post-rank");
276276

277277
// Construct the inverse suffix array of type B* suffixes using trsort.
278+
SA_dump(SA, "pre-tr");
278279
trsort::trsort(ISAb, SA, m, 1);
279280

280281
SA_dump(SA, "post-tr");

src/trsort.rs

+18-2
Original file line numberDiff line numberDiff line change
@@ -1125,22 +1125,36 @@ pub fn trsort(ISA: SAPtr, SA: &mut SuffixArray, n: Idx, depth: Idx) {
11251125
skip = 0;
11261126
unsorted = 0;
11271127

1128-
// do..while
1128+
// PETER
11291129
loop {
1130-
// body for do..while
11311130
t = SA[first];
11321131
if (t < 0) {
1132+
crosscheck!("t < 0");
11331133
first -= t;
11341134
skip += t;
1135+
crosscheck!("first={} skip={}", first, skip);
11351136
} else {
1137+
crosscheck!("t >= 0");
11361138
if (skip != 0) {
1139+
crosscheck!("SA[first={} + skip={}] = skip={}", first, skip, skip);
11371140
SA[first + skip] = skip;
11381141
skip = 0;
11391142
}
11401143
last = SAPtr(SA[ISA + t] + 1);
1144+
crosscheck!("last={}", last);
11411145
if (1 < (last - first)) {
1146+
crosscheck!("1<(last-first)");
11421147
budget.count = 0;
1148+
SA_dump(&SA.range_to(..n), "tr_introsort(A)");
1149+
crosscheck!(
1150+
"call tr_introsort ISA={} ISAd={} first={} last={}",
1151+
ISA,
1152+
ISAd,
1153+
first,
1154+
last
1155+
);
11431156
tr_introsort(ISA, ISAd, SA, first, last, &mut budget);
1157+
SA_dump(&SA.range_to(..n), "tr_introsort(B)");
11441158
if (budget.count != 0) {
11451159
unsorted += budget.count;
11461160
} else {
@@ -1159,9 +1173,11 @@ pub fn trsort(ISA: SAPtr, SA: &mut SuffixArray, n: Idx, depth: Idx) {
11591173
}
11601174

11611175
if (skip != 0) {
1176+
crosscheck!("skip != 0, trsort-end");
11621177
SA[first + skip] = skip;
11631178
}
11641179
if (unsorted == 0) {
1180+
crosscheck!("unsorted == 0, trsort-end");
11651181
break;
11661182
}
11671183

testdata/input.txt

+2
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ A corpus may contain texts in a single language (monolingual corpus) or text dat
33
Multilingual corpora that have been specially formatted for side-by-side comparison are called aligned parallel corpora. There are two main types of parallel corpora which contain texts in two languages. In a translation corpus, the texts in one language are translations of texts in the other language. In a comparable corpus, the texts are of the same kind and cover the same content, but they are not translations of each other.[1] To exploit a parallel text, some kind of text alignment identifying equivalent text segments (phrases or sentences) is a prerequisite for analysis. Machine translation algorithms for translating between two languages are often trained using parallel fragments comprising a first language corpus and a second language corpus which is an element-for-element translation of the first language corpus.[2]
44

55
In order to make the corpora more useful for doing linguistic research, they are often subjected to a process known as annotation. An example of annotating a corpus is part-of-speech tagging, or POS-tagging, in which information about each word's part of speech (verb, noun, adjective, etc.) is added to the corpus in the form of tags. Another example is indicating the lemma (base) form of each word. When the language of the corpus is not a working language of the researchers who use it, interlinear glossing is used to make the annotation bilingual.
6+
7+
Some corpora have further structured levels of analysis applied. In particular, a number of smaller corpora may be fully parsed. Such corpora are usually called Treebanks or Parsed Corpora. The difficulty of ensuring that the entire corpus is completely and consistently annotated means that these corpora are usually smaller, containing around one to three million words.

0 commit comments

Comments
 (0)