1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
/**
*
* Copyright (c) 2009-2010, Kitty Barnett
*
* The source code in this file is provided to you under the terms of the
* GNU General Public License, version 2.0, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. Terms of the GPL can be found in doc/GPL-license.txt
* in this distribution, or online at http://www.gnu.org/licenses/gpl-2.0.txt
*
* By copying, modifying or distributing this software, you acknowledge that
* you have read and understood your obligations described above, and agree to
* abide by those obligations.
*
*/
#include "llviewerprecompiledheaders.h"
#include "rlvmultistringsearch.h"
// ====================================================================================
#ifndef RLV_LOWORD
#define RLV_LOWORD(x) ( (U16)( ((U32)x) & 0xFFFF) )
#endif // RLV_LOWORD
#ifndef RLV_HIWORD
#define RLV_HIWORD(x) ( (U16)( (((U32)x) >> 16) & 0xFFFF) )
#endif // RLV_HIWORD
// ====================================================================================
// (TODO-RLV: oops, forgot I was experimenting with word matching, get rid of that again?)
#define isLetter(ch) \
( ( (ch >= 'a') && (ch <= 'z') ) || ( (ch >= 'A') && (ch <= 'Z') ) )
RlvMultiStringSearch::RlvMultiStringSearch()
: m_FSM(256), // Start our FSM with room for 256 states (enough for all attachment point names)
m_cntState(0)
{
}
void RlvMultiStringSearch::addKeyword(const std::string& strKeyword, U16 nParam) {
U16 nCurState = 0; // Always start the loop at state 0
//
// Make sure there are enough unused rows to accomodate the worst case (== strKeyword.length() new states)
//
size_t nMaxState = m_FSM.getSize();
if (m_cntState + strKeyword.length() > nMaxState)
// Allocate enough new rows (in batches of 256 rows)
m_FSM.resize(nMaxState + ((strKeyword.length() / 256) + 1) * 256);
//
// Walk the string character by character
//
for (int idxCh = 0, cntCh = strKeyword.length(); idxCh < cntCh; idxCh++) {
// Look up the next state for current character
unsigned char ch = strKeyword[idxCh];
U16 nState = RLV_LOWORD(m_FSM[nCurState][ch]);
// If we're at the last character in the keyword then set the termination bit
if (cntCh - 1 == idxCh)
{
// (Only set the termination bit for the state because this keyword might be a substring of another keyword)
m_FSM[nCurState][ch] = (nParam << 16) | (nState | 0x8000);
}
else if ( (nState & 0x7FFF) == 0 ) // If the new state is 0 then we're creating a new path
{
// (Preserve the termination bit because another keyword might be a substring of this one)
nState = ++m_cntState | (nState & 0x8000);
// Store the new path in the FSM
//m_FSM[nCurState][ch] = (nParam << 16) | nState;
m_FSM[nCurState][ch] |= nState;
}
nCurState = nState & 0x7FFF; // Mask out the termination bit since we never need it for the current state
}
}
// (Iterating over a "const char*" is *significantly* faster than "std::string")
bool RlvMultiStringSearch::findNext(const char* pstrText, int idxCh, int cntCh, RlvMultiStringSearchMatch& match, bool fWordMatch) const
{
U16 nCurState = 0; // Always start the loop at state 0
U32 nLastMatch = 0; // Holds the state of the last (possibly partial) keyword match
//
// Walk the string character by character
//
for (; idxCh < cntCh; idxCh++)
{
// Keep track of the current state in case we need to backtrack
U16 nPrevState = nCurState;
// If we're currently in state 0, save the current character index (for backtracking or as keyword index match)
if (nCurState == 0)
match.idxMatch = idxCh;
// Look up the current character in the FSM
unsigned char ch = (unsigned char)pstrText[idxCh];
U32 nCell = m_FSM[nCurState & 0x7FFF][ch];
// If the termination bit is set then we found a keyword substring match
// If the next state is non-zero then we can't stop yet because the matched keyword might be a substring of another keyword
if (nCell & 0x8000)
{
if ( 0 == (nCell & 0x7FFF) )
{
// Termination bit with 'next state' equal to 0: matched keyword which isn't also a substring of any other keyword
match.lenMatch = idxCh - match.idxMatch + 1;
match.nParam = RLV_HIWORD(nCell);
// Rudimentary word matching: check if the match is a 'word'
if
(
(!fWordMatch) ||
(
( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) && // Start of string OR non-letter
( (!isLetter(pstrText[match.idxMatch + match.lenMatch])) )
)
)
{
return true;
}
// Not a word, but there's no need to backtrack: we can move on from the character after the current one
nCell = 0; // Will set nCurState == 0 further down
match.idxMatch = idxCh; // Makes sure we move on to the next character instead of backtracking
}
else
{
nLastMatch = nCell;
// In case it turns out that we need to backtrack and return this match, save the length of this match
match.lenMatch = idxCh - match.idxMatch + 1;
}
}
nCurState = RLV_LOWORD(nCell);
// If our new state is 0, but our previous state wasn't, then we followed a false lead and need to backtrack
if ( (nPrevState != 0) && (nCurState == 0) )
{
// * if nLastMatch == 0 then we need to backtrack and keep going
// * if nLastMatch != 0 then we previously encountered a keyword match so return that one
if (nLastMatch) {
// Rudimentary word matching: check if the match is a 'word'
if
(
(!fWordMatch) ||
(
( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) && // Start of string OR non-letter
( (!isLetter(pstrText[match.idxMatch + match.lenMatch])) )
)
)
{
match.nParam = RLV_HIWORD(nLastMatch);
return true;
} else
// Not a word match, so throw away this partial match and backtrack
nLastMatch = 0;
}
idxCh = match.idxMatch;
}
}
// We encountered a match, but while investigating whether it was a substring of another keyword we ran out of characters
if (nLastMatch)
{
// Rudimentary word matching: check if we started at the beginning of a word (we know the one behind us is '\0')
if ( (!fWordMatch) || ( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) )
{
match.nParam = RLV_HIWORD(nLastMatch);
return true;
}
}
// None of the keywords is contained in the string: return failure
match.idxMatch = -1;
return false;
}
bool RlvMultiStringSearch::findLast(const std::string& strText, RlvMultiStringSearchMatch& match) const {
RlvMultiStringSearchMatch matchTemp;
match.idxMatch = -1; // (Needed to make the return work in case we don't find anything)
matchTemp.lenMatch = 0; // (Needed to make the first loop iteration start at 0)
// Iterating over a "const char*" is *significantly* faster than "std::string"
const char* pstrText = strText.c_str();
int lenText = strText.length();
while (findNext(pstrText, matchTemp.idxMatch + matchTemp.lenMatch + 1, lenText, matchTemp))
match = matchTemp;
return (match.idxMatch != -1);
}
std::vector<RlvMultiStringSearchMatch> RlvMultiStringSearch::findAll(const std::string& strText) {
std::vector<RlvMultiStringSearchMatch> arMatch;
RlvMultiStringSearchMatch match;
match.lenMatch = 0; // (Needed to make the first loop iteration start at 0)
while (findNext(strText, match.idxMatch + match.lenMatch + 1, match))
arMatch.push_back(match);
return arMatch;
}
// ====================================================================================
|