/** * * Copyright (c) 2009-2010, Kitty Barnett * * The source code in this file is provided to you under the terms of the * GNU General Public License, version 2.0, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. Terms of the GPL can be found in doc/GPL-license.txt * in this distribution, or online at http://www.gnu.org/licenses/gpl-2.0.txt * * By copying, modifying or distributing this software, you acknowledge that * you have read and understood your obligations described above, and agree to * abide by those obligations. * */ #include "llviewerprecompiledheaders.h" #include "rlvmultistringsearch.h" // ==================================================================================== #ifndef RLV_LOWORD #define RLV_LOWORD(x) ( (U16)( ((U32)x) & 0xFFFF) ) #endif // RLV_LOWORD #ifndef RLV_HIWORD #define RLV_HIWORD(x) ( (U16)( (((U32)x) >> 16) & 0xFFFF) ) #endif // RLV_HIWORD // ==================================================================================== // (TODO-RLV: oops, forgot I was experimenting with word matching, get rid of that again?) #define isLetter(ch) \ ( ( (ch >= 'a') && (ch <= 'z') ) || ( (ch >= 'A') && (ch <= 'Z') ) ) RlvMultiStringSearch::RlvMultiStringSearch() : m_FSM(256), // Start our FSM with room for 256 states (enough for all attachment point names) m_cntState(0) { } void RlvMultiStringSearch::addKeyword(const std::string& strKeyword, U16 nParam) { U16 nCurState = 0; // Always start the loop at state 0 // // Make sure there are enough unused rows to accomodate the worst case (== strKeyword.length() new states) // size_t nMaxState = m_FSM.getSize(); if (m_cntState + strKeyword.length() > nMaxState) // Allocate enough new rows (in batches of 256 rows) m_FSM.resize(nMaxState + ((strKeyword.length() / 256) + 1) * 256); // // Walk the string character by character // for (int idxCh = 0, cntCh = strKeyword.length(); idxCh < cntCh; idxCh++) { // Look up the next state for current character unsigned char ch = strKeyword[idxCh]; U16 nState = RLV_LOWORD(m_FSM[nCurState][ch]); // If we're at the last character in the keyword then set the termination bit if (cntCh - 1 == idxCh) { // (Only set the termination bit for the state because this keyword might be a substring of another keyword) m_FSM[nCurState][ch] = (nParam << 16) | (nState | 0x8000); } else if ( (nState & 0x7FFF) == 0 ) // If the new state is 0 then we're creating a new path { // (Preserve the termination bit because another keyword might be a substring of this one) nState = ++m_cntState | (nState & 0x8000); // Store the new path in the FSM //m_FSM[nCurState][ch] = (nParam << 16) | nState; m_FSM[nCurState][ch] |= nState; } nCurState = nState & 0x7FFF; // Mask out the termination bit since we never need it for the current state } } // (Iterating over a "const char*" is *significantly* faster than "std::string") bool RlvMultiStringSearch::findNext(const char* pstrText, int idxCh, int cntCh, RlvMultiStringSearchMatch& match, bool fWordMatch) const { U16 nCurState = 0; // Always start the loop at state 0 U32 nLastMatch = 0; // Holds the state of the last (possibly partial) keyword match // // Walk the string character by character // for (; idxCh < cntCh; idxCh++) { // Keep track of the current state in case we need to backtrack U16 nPrevState = nCurState; // If we're currently in state 0, save the current character index (for backtracking or as keyword index match) if (nCurState == 0) match.idxMatch = idxCh; // Look up the current character in the FSM unsigned char ch = (unsigned char)pstrText[idxCh]; U32 nCell = m_FSM[nCurState & 0x7FFF][ch]; // If the termination bit is set then we found a keyword substring match // If the next state is non-zero then we can't stop yet because the matched keyword might be a substring of another keyword if (nCell & 0x8000) { if ( 0 == (nCell & 0x7FFF) ) { // Termination bit with 'next state' equal to 0: matched keyword which isn't also a substring of any other keyword match.lenMatch = idxCh - match.idxMatch + 1; match.nParam = RLV_HIWORD(nCell); // Rudimentary word matching: check if the match is a 'word' if ( (!fWordMatch) || ( ( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) && // Start of string OR non-letter ( (!isLetter(pstrText[match.idxMatch + match.lenMatch])) ) ) ) { return true; } // Not a word, but there's no need to backtrack: we can move on from the character after the current one nCell = 0; // Will set nCurState == 0 further down match.idxMatch = idxCh; // Makes sure we move on to the next character instead of backtracking } else { nLastMatch = nCell; // In case it turns out that we need to backtrack and return this match, save the length of this match match.lenMatch = idxCh - match.idxMatch + 1; } } nCurState = RLV_LOWORD(nCell); // If our new state is 0, but our previous state wasn't, then we followed a false lead and need to backtrack if ( (nPrevState != 0) && (nCurState == 0) ) { // * if nLastMatch == 0 then we need to backtrack and keep going // * if nLastMatch != 0 then we previously encountered a keyword match so return that one if (nLastMatch) { // Rudimentary word matching: check if the match is a 'word' if ( (!fWordMatch) || ( ( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) && // Start of string OR non-letter ( (!isLetter(pstrText[match.idxMatch + match.lenMatch])) ) ) ) { match.nParam = RLV_HIWORD(nLastMatch); return true; } else // Not a word match, so throw away this partial match and backtrack nLastMatch = 0; } idxCh = match.idxMatch; } } // We encountered a match, but while investigating whether it was a substring of another keyword we ran out of characters if (nLastMatch) { // Rudimentary word matching: check if we started at the beginning of a word (we know the one behind us is '\0') if ( (!fWordMatch) || ( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) ) { match.nParam = RLV_HIWORD(nLastMatch); return true; } } // None of the keywords is contained in the string: return failure match.idxMatch = -1; return false; } bool RlvMultiStringSearch::findLast(const std::string& strText, RlvMultiStringSearchMatch& match) const { RlvMultiStringSearchMatch matchTemp; match.idxMatch = -1; // (Needed to make the return work in case we don't find anything) matchTemp.lenMatch = 0; // (Needed to make the first loop iteration start at 0) // Iterating over a "const char*" is *significantly* faster than "std::string" const char* pstrText = strText.c_str(); int lenText = strText.length(); while (findNext(pstrText, matchTemp.idxMatch + matchTemp.lenMatch + 1, lenText, matchTemp)) match = matchTemp; return (match.idxMatch != -1); } std::vector RlvMultiStringSearch::findAll(const std::string& strText) { std::vector arMatch; RlvMultiStringSearchMatch match; match.lenMatch = 0; // (Needed to make the first loop iteration start at 0) while (findNext(strText, match.idxMatch + match.lenMatch + 1, match)) arMatch.push_back(match); return arMatch; } // ====================================================================================