|
1 |
| -/* |
2 |
| - * Copyright (c) 2013 The Interedition Development Group. |
3 |
| - * |
4 |
| - * This file is part of CollateX. |
5 |
| - * |
6 |
| - * CollateX is free software: you can redistribute it and/or modify |
7 |
| - * it under the terms of the GNU General Public License as published by |
8 |
| - * the Free Software Foundation, either version 3 of the License, or |
9 |
| - * (at your option) any later version. |
10 |
| - * |
11 |
| - * CollateX is distributed in the hope that it will be useful, |
12 |
| - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
| - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 |
| - * GNU General Public License for more details. |
15 |
| - * |
16 |
| - * You should have received a copy of the GNU General Public License |
17 |
| - * along with CollateX. If not, see <http://www.gnu.org/licenses/>. |
18 |
| - */ |
19 |
| - |
20 |
| -package eu.interedition.collatex; |
21 |
| - |
22 |
| -import eu.interedition.collatex.dekker.Match; |
23 |
| -import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; |
24 |
| -import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer; |
25 |
| -import eu.interedition.collatex.util.VertexMatch; |
26 |
| - |
27 |
| -import java.util.Arrays; |
28 |
| -import java.util.Collections; |
29 |
| -import java.util.Comparator; |
30 |
| -import java.util.HashMap; |
31 |
| -import java.util.HashSet; |
32 |
| -import java.util.List; |
33 |
| -import java.util.Map; |
34 |
| -import java.util.Set; |
35 |
| -import java.util.SortedSet; |
36 |
| -import java.util.TreeSet; |
37 |
| -import java.util.logging.Level; |
38 |
| -import java.util.logging.Logger; |
39 |
| -import java.util.stream.Collectors; |
40 |
| -import java.util.stream.StreamSupport; |
41 |
| - |
42 |
| -/** |
43 |
| - * @author <a href="http://gregor.middell.net/" title="Homepage">Gregor Middell</a> |
44 |
| - */ |
45 |
| -public interface CollationAlgorithm { |
46 |
| - |
47 |
| - void collate(VariantGraph against, Iterable<Token> witness); |
48 |
| - |
49 |
| - void collate(VariantGraph against, Iterable<Token>... witnesses); |
50 |
| - |
51 |
| - void collate(VariantGraph against, List<? extends Iterable<Token>> witnesses); |
52 |
| - |
53 |
| - abstract class Base implements CollationAlgorithm { |
54 |
| - protected final Logger LOG = Logger.getLogger(getClass().getName()); |
55 |
| - protected Map<Token, VariantGraph.Vertex> witnessTokenVertices; |
56 |
| - |
57 |
| - @Override |
58 |
| - public void collate(VariantGraph against, Iterable<Token>... witnesses) { |
59 |
| - collate(against, Arrays.asList(witnesses)); |
60 |
| - } |
61 |
| - |
62 |
| - @Override |
63 |
| - public void collate(VariantGraph against, List<? extends Iterable<Token>> witnesses) { |
64 |
| - for (Iterable<Token> witness : witnesses) { |
65 |
| - if (LOG.isLoggable(Level.FINE)) { |
66 |
| - LOG.log(Level.FINE, "heap space: {0}/{1}", new Object[] { |
67 |
| - Runtime.getRuntime().totalMemory(), |
68 |
| - Runtime.getRuntime().maxMemory() |
69 |
| - }); |
70 |
| - } |
71 |
| - collate(against, witness); |
72 |
| - } |
73 |
| - } |
74 |
| - |
75 |
| - protected void merge(VariantGraph into, Iterable<Token> witnessTokens, Map<Token, VariantGraph.Vertex> alignments) { |
76 |
| - final Witness witness = StreamSupport.stream(witnessTokens.spliterator(), false) |
77 |
| - .findFirst() |
78 |
| - .map(Token::getWitness) |
79 |
| - .orElseThrow(() -> new IllegalArgumentException("Empty witness")); |
80 |
| - |
81 |
| - if (LOG.isLoggable(Level.FINE)) { |
82 |
| - LOG.log(Level.FINE, "{0} + {1}: Merge comparand into graph", new Object[] { into, witness }); |
83 |
| - } |
84 |
| - witnessTokenVertices = new HashMap<>(); |
85 |
| - VariantGraph.Vertex last = into.getStart(); |
86 |
| - final Set<Witness> witnessSet = Collections.singleton(witness); |
87 |
| - for (Token token : witnessTokens) { |
88 |
| - VariantGraph.Vertex matchingVertex = alignments.get(token); |
89 |
| - if (matchingVertex == null) { |
90 |
| - matchingVertex = into.add(token); |
91 |
| - } else { |
92 |
| - if (LOG.isLoggable(Level.FINE)) { |
93 |
| - LOG.log(Level.FINE, "Match: {0} to {1}", new Object[] { matchingVertex, token }); |
94 |
| - } |
95 |
| - matchingVertex.add(Collections.singleton(token)); |
96 |
| - } |
97 |
| - witnessTokenVertices.put(token, matchingVertex); |
98 |
| - |
99 |
| - into.connect(last, matchingVertex, witnessSet); |
100 |
| - last = matchingVertex; |
101 |
| - } |
102 |
| - into.connect(last, into.getEnd(), witnessSet); |
103 |
| - } |
104 |
| - |
105 |
| - protected void mergeTranspositions(VariantGraph into, Iterable<SortedSet<VertexMatch.WithToken>> transpositions) { |
106 |
| - for (SortedSet<VertexMatch.WithToken> transposedPhrase : transpositions) { |
107 |
| - if (LOG.isLoggable(Level.FINE)) { |
108 |
| - LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); |
109 |
| - } |
110 |
| - final Set<VariantGraph.Vertex> transposed = new HashSet<>(); |
111 |
| - for (VertexMatch.WithToken match : transposedPhrase) { |
112 |
| - transposed.add(witnessTokenVertices.get(match.token)); |
113 |
| - transposed.add(match.vertex); |
114 |
| - } |
115 |
| - into.transpose(transposed); |
116 |
| - } |
117 |
| - } |
118 |
| - |
119 |
| - protected void mergeTranspositions(VariantGraph into, List<List<Match>> transpositions) { |
120 |
| - for (List<Match> transposedPhrase : transpositions) { |
121 |
| - if (LOG.isLoggable(Level.FINE)) { |
122 |
| - LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); |
123 |
| - } |
124 |
| - final Set<VariantGraph.Vertex> transposed = new HashSet<>(); |
125 |
| - for (Match match : transposedPhrase) { |
126 |
| - transposed.add(witnessTokenVertices.get(match.token)); |
127 |
| - transposed.add(match.vertex); |
128 |
| - } |
129 |
| - into.transpose(transposed); |
130 |
| - } |
131 |
| - } |
132 |
| - |
133 |
| - protected void merge(VariantGraph graph, VariantGraph.Vertex[][] vertices, Token[] tokens, SortedSet<SortedSet<VertexMatch.WithTokenIndex>> matches) { |
134 |
| - @SuppressWarnings("unchecked") |
135 |
| - final SortedSet<VertexMatch.WithTokenIndex>[] matchesVertexOrder = matches.toArray(new SortedSet[matches.size()]); |
136 |
| - final SortedSet<VertexMatch.WithTokenIndex>[] matchesTokenOrder = Arrays.copyOf(matchesVertexOrder, matchesVertexOrder.length); |
137 |
| - |
138 |
| - Arrays.sort(matchesTokenOrder, Comparator.comparing(m -> m.first().token)); |
139 |
| - |
140 |
| - final Set<SortedSet<VertexMatch.WithTokenIndex>> alignedMatches = NeedlemanWunschAlgorithm.align( |
141 |
| - matchesVertexOrder, |
142 |
| - matchesTokenOrder, |
143 |
| - new MatchPhraseAlignmentScorer(Math.max(tokens.length, vertices.length)) |
144 |
| - ).keySet(); |
145 |
| - |
146 |
| - final Map<Token, VariantGraph.Vertex> alignments = matches.stream() |
147 |
| - .filter(alignedMatches::contains) |
148 |
| - .flatMap(Set::stream) |
149 |
| - .collect(Collectors.toMap(m -> tokens[m.token], m -> m.vertex)); |
150 |
| - |
151 |
| - final List<SortedSet<VertexMatch.WithToken>> transpositions = matches.stream() |
152 |
| - .filter(m -> !alignedMatches.contains(m)) |
153 |
| - .map(t -> t.stream().map(m -> new VertexMatch.WithToken(m.vertex, m.vertexRank, tokens[m.token])).collect(Collectors.toCollection(TreeSet::new))) |
154 |
| - .collect(Collectors.toList()); |
155 |
| - |
156 |
| - merge(graph, Arrays.asList(tokens), alignments); |
157 |
| - mergeTranspositions(graph, transpositions); |
158 |
| - } |
159 |
| - } |
160 |
| - |
161 |
| - static class MatchPhraseAlignmentScorer implements NeedlemanWunschScorer<SortedSet<VertexMatch.WithTokenIndex>, SortedSet<VertexMatch.WithTokenIndex>> { |
162 |
| - |
163 |
| - private final int maxWitnessLength; |
164 |
| - |
165 |
| - public MatchPhraseAlignmentScorer(int maxWitnessLength) { |
166 |
| - this.maxWitnessLength = maxWitnessLength; |
167 |
| - } |
168 |
| - |
169 |
| - @Override |
170 |
| - public float score(SortedSet<VertexMatch.WithTokenIndex> a, SortedSet<VertexMatch.WithTokenIndex> b) { |
171 |
| - return (a.equals(b) ? 1 : -maxWitnessLength); |
172 |
| - } |
173 |
| - |
174 |
| - @Override |
175 |
| - public float gap() { |
176 |
| - return -(1 / (maxWitnessLength * 1.0f)); |
177 |
| - } |
178 |
| - |
179 |
| - } |
180 |
| -} |
| 1 | +/* |
| 2 | + * Copyright (c) 2013 The Interedition Development Group. |
| 3 | + * |
| 4 | + * This file is part of CollateX. |
| 5 | + * |
| 6 | + * CollateX is free software: you can redistribute it and/or modify |
| 7 | + * it under the terms of the GNU General Public License as published by |
| 8 | + * the Free Software Foundation, either version 3 of the License, or |
| 9 | + * (at your option) any later version. |
| 10 | + * |
| 11 | + * CollateX is distributed in the hope that it will be useful, |
| 12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | + * GNU General Public License for more details. |
| 15 | + * |
| 16 | + * You should have received a copy of the GNU General Public License |
| 17 | + * along with CollateX. If not, see <http://www.gnu.org/licenses/>. |
| 18 | + */ |
| 19 | + |
| 20 | +package eu.interedition.collatex; |
| 21 | + |
| 22 | +import eu.interedition.collatex.dekker.Match; |
| 23 | +import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; |
| 24 | +import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer; |
| 25 | +import eu.interedition.collatex.util.VertexMatch; |
| 26 | + |
| 27 | +import java.util.Arrays; |
| 28 | +import java.util.Collections; |
| 29 | +import java.util.Comparator; |
| 30 | +import java.util.HashMap; |
| 31 | +import java.util.HashSet; |
| 32 | +import java.util.List; |
| 33 | +import java.util.Map; |
| 34 | +import java.util.Set; |
| 35 | +import java.util.SortedSet; |
| 36 | +import java.util.TreeSet; |
| 37 | +import java.util.logging.Level; |
| 38 | +import java.util.logging.Logger; |
| 39 | +import java.util.stream.Collectors; |
| 40 | +import java.util.stream.StreamSupport; |
| 41 | + |
| 42 | +/** |
| 43 | + * @author <a href="http://gregor.middell.net/" title="Homepage">Gregor Middell</a> |
| 44 | + */ |
| 45 | +public interface CollationAlgorithm { |
| 46 | + |
| 47 | + void collate(VariantGraph against, Iterable<Token> witness); |
| 48 | + |
| 49 | + void collate(VariantGraph against, Iterable<Token>... witnesses); |
| 50 | + |
| 51 | + void collate(VariantGraph against, List<? extends Iterable<Token>> witnesses); |
| 52 | + |
| 53 | + abstract class Base implements CollationAlgorithm { |
| 54 | + protected final Logger LOG = Logger.getLogger(getClass().getName()); |
| 55 | + protected Map<Token, VariantGraph.Vertex> witnessTokenVertices; |
| 56 | + |
| 57 | + @Override |
| 58 | + public void collate(VariantGraph against, Iterable<Token>... witnesses) { |
| 59 | + collate(against, Arrays.asList(witnesses)); |
| 60 | + } |
| 61 | + |
| 62 | + @Override |
| 63 | + public void collate(VariantGraph against, List<? extends Iterable<Token>> witnesses) { |
| 64 | + for (Iterable<Token> witness : witnesses) { |
| 65 | + if (LOG.isLoggable(Level.FINE)) { |
| 66 | + LOG.log(Level.FINE, "heap space: {0}/{1}", new Object[]{ |
| 67 | + Runtime.getRuntime().totalMemory(), |
| 68 | + Runtime.getRuntime().maxMemory() |
| 69 | + }); |
| 70 | + } |
| 71 | + collate(against, witness); |
| 72 | + } |
| 73 | + } |
| 74 | + |
| 75 | + protected void merge(VariantGraph into, Iterable<Token> witnessTokens, Map<Token, VariantGraph.Vertex> alignments) { |
| 76 | + final Witness witness = StreamSupport.stream(witnessTokens.spliterator(), false) |
| 77 | + .findFirst() |
| 78 | + .map(Token::getWitness) |
| 79 | + .orElseThrow(() -> new IllegalArgumentException("Empty witness")); |
| 80 | + |
| 81 | + if (LOG.isLoggable(Level.FINE)) { |
| 82 | + LOG.log(Level.FINE, "{0} + {1}: Merge comparand into graph", new Object[]{into, witness}); |
| 83 | + } |
| 84 | + witnessTokenVertices = new HashMap<>(); |
| 85 | + VariantGraph.Vertex last = into.getStart(); |
| 86 | + final Set<Witness> witnessSet = Collections.singleton(witness); |
| 87 | + for (Token token : witnessTokens) { |
| 88 | + VariantGraph.Vertex matchingVertex = alignments.get(token); |
| 89 | + if (matchingVertex == null) { |
| 90 | + matchingVertex = into.add(token); |
| 91 | + } else { |
| 92 | + if (LOG.isLoggable(Level.FINE)) { |
| 93 | + LOG.log(Level.FINE, "Match: {0} to {1}", new Object[]{matchingVertex, token}); |
| 94 | + } |
| 95 | + matchingVertex.add(Collections.singleton(token)); |
| 96 | + } |
| 97 | + witnessTokenVertices.put(token, matchingVertex); |
| 98 | + |
| 99 | + into.connect(last, matchingVertex, witnessSet); |
| 100 | + last = matchingVertex; |
| 101 | + } |
| 102 | + into.connect(last, into.getEnd(), witnessSet); |
| 103 | + } |
| 104 | + |
| 105 | + protected void mergeTranspositions(VariantGraph into, Iterable<SortedSet<VertexMatch.WithToken>> transpositions) { |
| 106 | + for (SortedSet<VertexMatch.WithToken> transposedPhrase : transpositions) { |
| 107 | + if (LOG.isLoggable(Level.FINE)) { |
| 108 | + LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); |
| 109 | + } |
| 110 | + final Set<VariantGraph.Vertex> transposed = new HashSet<>(); |
| 111 | + for (VertexMatch.WithToken match : transposedPhrase) { |
| 112 | + transposed.add(witnessTokenVertices.get(match.token)); |
| 113 | + transposed.add(match.vertex); |
| 114 | + } |
| 115 | + into.transpose(transposed); |
| 116 | + } |
| 117 | + } |
| 118 | + |
| 119 | + protected void mergeTranspositions(VariantGraph into, List<List<Match>> transpositions) { |
| 120 | + for (List<Match> transposedPhrase : transpositions) { |
| 121 | + if (LOG.isLoggable(Level.FINE)) { |
| 122 | + LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); |
| 123 | + } |
| 124 | + final Set<VariantGraph.Vertex> transposed = new HashSet<>(); |
| 125 | + for (Match match : transposedPhrase) { |
| 126 | + transposed.add(witnessTokenVertices.get(match.token)); |
| 127 | + transposed.add(match.vertex); |
| 128 | + } |
| 129 | + into.transpose(transposed); |
| 130 | + } |
| 131 | + } |
| 132 | + |
| 133 | + protected void merge(VariantGraph graph, VariantGraph.Vertex[][] vertices, Token[] tokens, SortedSet<SortedSet<VertexMatch.WithTokenIndex>> matches) { |
| 134 | + @SuppressWarnings("unchecked") |
| 135 | + final SortedSet<VertexMatch.WithTokenIndex>[] matchesVertexOrder = matches.toArray(new SortedSet[matches.size()]); |
| 136 | + final SortedSet<VertexMatch.WithTokenIndex>[] matchesTokenOrder = Arrays.copyOf(matchesVertexOrder, matchesVertexOrder.length); |
| 137 | + |
| 138 | + Arrays.sort(matchesTokenOrder, Comparator.comparing(m -> m.first().token)); |
| 139 | + |
| 140 | + final Set<SortedSet<VertexMatch.WithTokenIndex>> alignedMatches = NeedlemanWunschAlgorithm.align( |
| 141 | + matchesVertexOrder, |
| 142 | + matchesTokenOrder, |
| 143 | + new MatchPhraseAlignmentScorer(Math.max(tokens.length, vertices.length)) |
| 144 | + ).keySet(); |
| 145 | + |
| 146 | + final Map<Token, VariantGraph.Vertex> alignments = matches.stream() |
| 147 | + .filter(alignedMatches::contains) |
| 148 | + .flatMap(Set::stream) |
| 149 | + .collect(Collectors.toMap(m -> tokens[m.token], m -> m.vertex)); |
| 150 | + |
| 151 | + final List<SortedSet<VertexMatch.WithToken>> transpositions = matches.stream() |
| 152 | + .filter(m -> !alignedMatches.contains(m)) |
| 153 | + .map(t -> t.stream().map(m -> new VertexMatch.WithToken(m.vertex, m.vertexRank, tokens[m.token])).collect(Collectors.toCollection(TreeSet::new))) |
| 154 | + .collect(Collectors.toList()); |
| 155 | + |
| 156 | + merge(graph, Arrays.asList(tokens), alignments); |
| 157 | + mergeTranspositions(graph, transpositions); |
| 158 | + } |
| 159 | + } |
| 160 | + |
| 161 | + static class MatchPhraseAlignmentScorer implements NeedlemanWunschScorer<SortedSet<VertexMatch.WithTokenIndex>, SortedSet<VertexMatch.WithTokenIndex>> { |
| 162 | + |
| 163 | + private final int maxWitnessLength; |
| 164 | + |
| 165 | + public MatchPhraseAlignmentScorer(int maxWitnessLength) { |
| 166 | + this.maxWitnessLength = maxWitnessLength; |
| 167 | + } |
| 168 | + |
| 169 | + @Override |
| 170 | + public float score(SortedSet<VertexMatch.WithTokenIndex> a, SortedSet<VertexMatch.WithTokenIndex> b) { |
| 171 | + return (a.equals(b) ? 1 : -maxWitnessLength); |
| 172 | + } |
| 173 | + |
| 174 | + @Override |
| 175 | + public float gap() { |
| 176 | + return -(1 / (maxWitnessLength * 1.0f)); |
| 177 | + } |
| 178 | + |
| 179 | + } |
| 180 | +} |
0 commit comments