Skip to content

Commit 7b55979

Browse files
archi14jainaman224
authored andcommitted
Aho corasick (jainaman224#1395)
1 parent 6f2bc7a commit 7b55979

File tree

1 file changed

+150
-0
lines changed

1 file changed

+150
-0
lines changed

Aho-Corasick/Aho-Corasick.cpp

+150
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/* Aho-Corasick is a dictionary-matching algorithm that locates every
   occurrence of a finite set of strings (the "dictionary") within an input
   text. It matches all dictionary strings simultaneously, in a single pass
   over the text. */
4+
5+
using namespace std;
6+
#include <bits/stdc++.h>
7+
8+
// Maximum number of states in the matching machine.
// The trie uses one state for the root plus, in the worst case, one state per
// keyword character, so this must be at least (sum of all keyword lengths) + 1.
const int MAXSTATE = 6 * 50 + 10;

// Number of characters in the alphabet.
const int MAXCHAR = 26;

// Output for each state, as a bitwise mask: bit i is set when keyword i is
// recognised on reaching that state. The mask is a plain int, so the number of
// keywords is limited by the value bits of int (31 with a 32-bit int).
int out[MAXSTATE];

// Failure function: the state to fall back to when the current state has no
// transition for the next character. -1 means "not set".
int fail[MAXSTATE];

// Goto function: g[s][c] is the state reached from state s on character index
// c, or -1 if there is no such transition.
int g[MAXSTATE][MAXCHAR];
20+
21+
int buildMachine(const vector<string> &words, char lowestChar = 'a', char highestChar = 'z')
22+
{
23+
memset(out, 0, sizeof out); //intializing out with 0s
24+
memset(fail, -1, sizeof fail); //intializing fail with -1s
25+
memset(g, -1, sizeof g); //intializing g with -1s
26+
int states = 1; // Initially, we just have the 0 state
27+
28+
for (int i = 0; i < words.size(); i++)
29+
{
30+
const string &keyword = words[i];
31+
int currentState = 0;
32+
for (int j = 0; j < keyword.size(); j++)
33+
{
34+
int c = keyword[j] - lowestChar;
35+
if (g[currentState][c] == -1)
36+
{
37+
// Allocate a new node
38+
g[currentState][c] = states++;
39+
40+
}
41+
currentState = g[currentState][c];
42+
}
43+
out[currentState] |= (1 << i);
44+
// There's a match of keywords[i] at node currentState.
45+
}
46+
// State 0 should have an outgoing edge for all characters.
47+
for (int c = 0; c < MAXCHAR; c++)
48+
{
49+
if (g[0][c] == -1)
50+
{
51+
g[0][c] = 0;
52+
}
53+
}
54+
//building the failure function
55+
queue<int> q;
56+
for (int c = 0; c <= highestChar - lowestChar; c++)
57+
{
58+
// Iterate over every possible input. All nodes s of depth 1 have fail[s] = 0
59+
if (g[0][c] != -1 && g[0][c] != 0)
60+
{
61+
fail[g[0][c]] = 0;
62+
q.push(g[0][c]);
63+
}
64+
}
65+
66+
while (q.size())
67+
{
68+
int state = q.front();
69+
q.pop();
70+
for (int c = 0; c <= highestChar - lowestChar; c++)
71+
{
72+
if (g[state][c] != -1)
73+
{
74+
int failure = fail[state];
75+
int failure = fail[state];
76+
while (g[failure][c] == -1)
77+
{
78+
failure = fail[failure];
79+
}
80+
failure = g[failure][c];
81+
fail[g[state][c]] = failure;
82+
out[g[state][c]] |= out[failure]; // Merge out values
83+
q.push(g[state][c]);
84+
}
85+
}
86+
}
87+
return states;
88+
}
89+
90+
int findNextState(int currentState, char nextInput, char lowestChar = 'a')
91+
{
92+
int answer = currentState;
93+
int c = nextInput - lowestChar;
94+
while (g[answer][c] == -1)
95+
{
96+
answer = fail[answer];
97+
}
98+
return g[answer][c];
99+
}
100+
101+
int main()
102+
{
103+
vector<string> keywords;
104+
cout<<"Enter the number of keywords you want to enter";
105+
int n;
106+
for (int i = 0; i < n; i++)
107+
{
108+
string temp;
109+
cin >> temp;
110+
keywords.push_back(temp);
111+
}
112+
cout<<"Enter text";
113+
string text;
114+
cin>>text;
115+
buildMachine(keywords, 'a', 'z');
116+
int currentState = 0;
117+
for (int i = 0; i < text.size(); i++)
118+
{
119+
currentState = findNextState(currentState, text[i], 'a');
120+
if (out[currentState] == 0)
121+
{
122+
continue; // Nothing new, moving on to the next character.
123+
}
124+
for (int j = 0; j < keywords.size(); j++)
125+
{
126+
if (out[currentState] & (1 << j))
127+
{
128+
// Matched keywords[j]
129+
cout << "Keyword " << keywords[j] << " appears from " << i - keywords[j].size() + 1 << " to " << i << endl;
130+
}
131+
}
132+
}
133+
return 0;
134+
}
135+
136+
/*
Input

keywords = {"he", "she", "hers", "his"}
text = "ahishers"

Output

Keyword his appears from 1 to 3
Keyword he appears from 4 to 5
Keyword she appears from 3 to 5
Keyword hers appears from 4 to 7

*/

0 commit comments

Comments
 (0)