/* vim: set tabstop=4 shiftwidth=4: */ /* * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * * Copyright (C) 2011-2011 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute * it freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must * not claim that you wrote the original software. If you use this * software in a product, an acknowledgement in the product * documentation would be appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must * not be misrepresented as being the original software. * 3. This notice may not be removed or altered from any source * distribution. * * The main reference is Unicode Standard Annex 29 (UAX #29): * * * When this library was designed, this annex was at Revision 17, for * Unicode 6.0.0: * * * The Unicode Terms of Use are available at * */ /** * @file wordbreak.c * * Implementation of the word breaking algorithm as described in Unicode * Standard Annex 29. * * @version 2.0, 2011/12/12 * @author Tom Hacohen */ #include #include #include #include "linebreak.h" #include "linebreakdef.h" #include "wordbreak.h" #include "wordbreakdata.x" #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) /* Init the wordbreak internals. */ void init_wordbreak(void) { /* Currently does nothing, may be needed in the future. */ return; } /** * Gets the word breaking class of a character. * * @param ch character to check * @param wbp pointer to the wbp breaking properties array * @param len the size of the wbp array in number of items. * @return the word breaking class if found; \c WBP_Any otherwise */ static enum WordBreakClass get_char_wb_class( utf32_t ch, struct WordBreakProperties *wbp, size_t len) { int min = 0; int max = len - 1; int mid; do { mid = (min + max) / 2; if (ch < wbp[mid].start) max = mid - 1; else if (ch > wbp[mid].end) min = mid + 1; else return wbp[mid].prop; } while (min <= max); return WBP_Any; } /** * Sets the break types in brks starting from posLast up to posStop. * * It sets the inside chars to #WORDBREAK_INSIDECHAR and the rest to brkType. * Assumes brks is initialized - all the cells with #WORDBREAK_NOBREAK are * cells that we really don't want to break after. * * @param s the string * @param brks[out] the breaks array to fill. * @param posStart the start position * @param posEnd the end position * @param len the length of the string * @param brkType the breaks type to use * @param get_next_char function to get the next UTF-32 character */ static void set_brks_to(const void *s, char *brks, size_t posStart, size_t posEnd, size_t len, char brkType, get_next_char_t get_next_char) { size_t posCur = posStart; while (posCur < posEnd) { get_next_char(s, len, &posCur); for ( ; posStart < posCur - 1; ++posStart) { brks[posStart] = WORDBREAK_INSIDECHAR; } assert(posStart == posCur - 1); /* Only set it if we haven't set it not to break before. */ if (brks[posStart] != WORDBREAK_NOBREAK) brks[posStart] = brkType; posStart = posCur; } } /* Checks to see if newline, cr, or lf. for WB3a and b */ #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \ (cls == WBP_LF)) /** * Sets the word breaking information for a generic input string. * * @param[in] s input string * @param[in] len length of the input * @param[in] lang language of the input * @param[out] brks pointer to the output breaking data, containing * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or * #WORDBREAK_INSIDEACHAR * @param[in] get_next_char function to get the next UTF-32 character */ static void set_wordbreaks( const void *s, size_t len, const char *lang, char *brks, get_next_char_t get_next_char) { /* Previous class */ enum WordBreakClass p_cls = WBP_Undefined; /* Strong previous class. */ enum WordBreakClass sp_cls = WBP_Undefined; utf32_t ch; size_t posCur = 0; size_t posCurSt = 0; size_t posLast = 0; /* FIXME: unused atm. */ (void) lang; /* Init brks */ memset(brks, WORDBREAK_BREAK, len); ch = get_next_char(s, len, &posCur); /* WB3a, WB3b are implied. */ for ( ; ch != EOS ; ) { /* Current class */ enum WordBreakClass c_cls; c_cls = get_char_wb_class(ch, wb_prop_default, ARRAY_LEN(wb_prop_default)); switch (c_cls) { case WBP_CR: set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; break; case WBP_LF: if (sp_cls == WBP_CR) /* WB3 */ { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; } sp_cls = c_cls; posLast = posCurSt; break; case WBP_Newline: /* WB3a, WB3b */ set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; break; case WBP_Extend: case WBP_Format: /* WB4 - If not the first char/after a newline (W3ab), * skip this class, set it to be the same as the prev, and mark * brks not to break before them. */ if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls)) { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; } else { /* It's surely not the first */ brks[posCurSt - 1] = WORDBREAK_NOBREAK; /* "inherit" the previous class. */ c_cls = p_cls; } break; case WBP_Katakana: if ((sp_cls == WBP_Katakana) || /* WB13 */ (sp_cls == WBP_ExtendNumLet)) /* WB13b */ { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, get_next_char); } /* No rule found, reset */ else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); } sp_cls = c_cls; posLast = posCurSt; break; case WBP_ALetter: if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */ ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */ (sp_cls == WBP_ExtendNumLet)) /* WB13b */ { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, get_next_char); } /* No rule found, reset */ else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); } sp_cls = c_cls; posLast = posCurSt; break; case WBP_MidNumLet: if ((p_cls == WBP_ALetter) || /* WBP6,7 */ (p_cls == WBP_Numeric)) /* WBP11,12 */ { /* Go on */ } else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; } break; case WBP_MidLetter: if (p_cls == WBP_ALetter) /* WBP6,7 */ { /* Go on */ } else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; } break; case WBP_MidNum: if (p_cls == WBP_Numeric) /* WBP11,12 */ { /* Go on */ } else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; } break; case WBP_Numeric: if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */ ((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */ (sp_cls == WBP_ExtendNumLet)) /* WB13b */ { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, get_next_char); } /* No rule found, reset */ else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); } sp_cls = c_cls; posLast = posCurSt; break; case WBP_ExtendNumLet: /* WB13a,13b */ if ((sp_cls == p_cls) && ((p_cls == WBP_ALetter) || (p_cls == WBP_Numeric) || (p_cls == WBP_Katakana) || (p_cls == WBP_ExtendNumLet))) { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, get_next_char); } /* No rule found, reset */ else { set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); } sp_cls = c_cls; posLast = posCurSt; break; case WBP_Any: /* Allow breaks and reset */ set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, get_next_char); sp_cls = c_cls; posLast = posCurSt; break; default: /* Error, should never get here! */ assert(0); break; } p_cls = c_cls; posCurSt = posCur; ch = get_next_char(s, len, &posCur); } /* WB2 */ set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK, get_next_char); } /** * Sets the word breaking information for a UTF-8 input string. * * @param[in] s input UTF-8 string * @param[in] len length of the input * @param[in] lang language of the input * @param[out] brks pointer to the output breaking data, containing * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf8( const utf8_t *s, size_t len, const char *lang, char *brks) { set_wordbreaks(s, len, lang, brks, (get_next_char_t)lb_get_next_char_utf8); } /** * Sets the word breaking information for a UTF-16 input string. * * @param[in] s input UTF-16 string * @param[in] len length of the input * @param[in] lang language of the input * @param[out] brks pointer to the output breaking data, containing * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf16( const utf16_t *s, size_t len, const char *lang, char *brks) { set_wordbreaks(s, len, lang, brks, (get_next_char_t)lb_get_next_char_utf16); } /** * Sets the word breaking information for a UTF-32 input string. * * @param[in] s input UTF-32 string * @param[in] len length of the input * @param[in] lang language of the input * @param[out] brks pointer to the output breaking data, containing * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf32( const utf32_t *s, size_t len, const char *lang, char *brks) { set_wordbreaks(s, len, lang, brks, (get_next_char_t)lb_get_next_char_utf32); }