From 825a3d837a33f226c879cd02ad15c3fba57e8b2c Mon Sep 17 00:00:00 2001 From: David Walter Seikel Date: Mon, 23 Jan 2012 23:30:42 +1000 Subject: Update the EFL to what I'm actually using, coz I'm using some stuff not yet released. --- .../evas/src/static_deps/liblinebreak/wordbreak.c | 435 +++++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 libraries/evas/src/static_deps/liblinebreak/wordbreak.c (limited to 'libraries/evas/src/static_deps/liblinebreak/wordbreak.c') diff --git a/libraries/evas/src/static_deps/liblinebreak/wordbreak.c b/libraries/evas/src/static_deps/liblinebreak/wordbreak.c new file mode 100644 index 0000000..bbbb7f4 --- /dev/null +++ b/libraries/evas/src/static_deps/liblinebreak/wordbreak.c @@ -0,0 +1,435 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2011-2011 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file wordbreak.c + * + * Implementation of the word breaking algorithm as described in Unicode + * Standard Annex 29. + * + * @version 2.0, 2011/12/12 + * @author Tom Hacohen + */ + + +#include +#include +#include +#include "linebreak.h" +#include "linebreakdef.h" + +#include "wordbreak.h" +#include "wordbreakdata.x" + +#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) + +/* Init the wordbreak internals. */ +void init_wordbreak(void) +{ + /* Currently does nothing, may be needed in the future. */ + return; +} + +/** + * Gets the word breaking class of a character. + * + * @param ch character to check + * @param wbp pointer to the wbp breaking properties array + * @param len the size of the wbp array in number of items. + * @return the word breaking class if found; \c WBP_Any otherwise + */ +static enum WordBreakClass get_char_wb_class( + utf32_t ch, + struct WordBreakProperties *wbp, + size_t len) +{ + int min = 0; + int max = len - 1; + int mid; + + do + { + mid = (min + max) / 2; + + if (ch < wbp[mid].start) + max = mid - 1; + else if (ch > wbp[mid].end) + min = mid + 1; + else + return wbp[mid].prop; + } + while (min <= max); + + return WBP_Any; +} + +/** + * Sets the break types in brks starting from posLast up to posStop. + * + * It sets the inside chars to #WORDBREAK_INSIDECHAR and the rest to brkType. + * Assumes brks is initialized - all the cells with #WORDBREAK_NOBREAK are + * cells that we really don't want to break after. + * + * @param s the string + * @param brks[out] the breaks array to fill. + * @param posStart the start position + * @param posEnd the end position + * @param len the length of the string + * @param brkType the breaks type to use + * @param get_next_char function to get the next UTF-32 character + */ +static void set_brks_to(const void *s, + char *brks, + size_t posStart, + size_t posEnd, + size_t len, + char brkType, + get_next_char_t get_next_char) +{ + size_t posCur = posStart; + while (posCur < posEnd) + { + get_next_char(s, len, &posCur); + for ( ; posStart < posCur - 1; ++posStart) + { + brks[posStart] = WORDBREAK_INSIDECHAR; + } + assert(posStart == posCur - 1); + + /* Only set it if we haven't set it not to break before. */ + if (brks[posStart] != WORDBREAK_NOBREAK) + brks[posStart] = brkType; + posStart = posCur; + } +} + +/* Checks to see if newline, cr, or lf. for WB3a and b */ +#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \ + (cls == WBP_LF)) + +/** + * Sets the word breaking information for a generic input string. + * + * @param[in] s input string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + * @param[in] get_next_char function to get the next UTF-32 character + */ +static void set_wordbreaks( + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char) +{ + /* Previous class */ + enum WordBreakClass p_cls = WBP_Undefined; + /* Strong previous class. */ + enum WordBreakClass sp_cls = WBP_Undefined; + utf32_t ch; + size_t posCur = 0; + size_t posCurSt = 0; + size_t posLast = 0; + + /* FIXME: unused atm. */ + (void) lang; + + + /* Init brks */ + memset(brks, WORDBREAK_BREAK, len); + + ch = get_next_char(s, len, &posCur); + + /* WB3a, WB3b are implied. */ + for ( ; ch != EOS ; ) + { + /* Current class */ + enum WordBreakClass c_cls; + c_cls = get_char_wb_class(ch, wb_prop_default, + ARRAY_LEN(wb_prop_default)); + + switch (c_cls) + { + case WBP_CR: + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_LF: + if (sp_cls == WBP_CR) /* WB3 */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_Newline: + /* WB3a, WB3b */ + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_Extend: + case WBP_Format: + /* WB4 - If not the first char/after a newline (W3ab), + * skip this class, set it to be the same as the prev, and mark + * brks not to break before them. */ + if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls)) + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + } + else + { + /* It's surely not the first */ + brks[posCurSt - 1] = WORDBREAK_NOBREAK; + /* "inherit" the previous class. */ + c_cls = p_cls; + } + break; + + case WBP_Katakana: + if ((sp_cls == WBP_Katakana) || /* WB13 */ + (sp_cls == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_ALetter: + if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */ + ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */ + (sp_cls == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_MidNumLet: + if ((p_cls == WBP_ALetter) || /* WBP6,7 */ + (p_cls == WBP_Numeric)) /* WBP11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + break; + + case WBP_MidLetter: + if (p_cls == WBP_ALetter) /* WBP6,7 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + break; + + case WBP_MidNum: + if (p_cls == WBP_Numeric) /* WBP11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + break; + + case WBP_Numeric: + if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */ + ((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */ + (sp_cls == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_ExtendNumLet: + /* WB13a,13b */ + if ((sp_cls == p_cls) && + ((p_cls == WBP_ALetter) || + (p_cls == WBP_Numeric) || + (p_cls == WBP_Katakana) || + (p_cls == WBP_ExtendNumLet))) + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_Any: + /* Allow breaks and reset */ + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + break; + + default: + /* Error, should never get here! */ + assert(0); + break; + } + + p_cls = c_cls; + posCurSt = posCur; + ch = get_next_char(s, len, &posCur); + } + + /* WB2 */ + set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK, + get_next_char); +} + +/** + * Sets the word breaking information for a UTF-8 input string. + * + * @param[in] s input UTF-8 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf8( + const utf8_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf8); +} + +/** + * Sets the word breaking information for a UTF-16 input string. + * + * @param[in] s input UTF-16 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf16( + const utf16_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf16); +} + +/** + * Sets the word breaking information for a UTF-32 input string. + * + * @param[in] s input UTF-32 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf32( + const utf32_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf32); +} -- cgit v1.1