diff options
Diffstat (limited to '')
-rw-r--r-- | libraries/evas/src/static_deps/liblinebreak/wordbreak.c | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/libraries/evas/src/static_deps/liblinebreak/wordbreak.c b/libraries/evas/src/static_deps/liblinebreak/wordbreak.c new file mode 100644 index 0000000..bbbb7f4 --- /dev/null +++ b/libraries/evas/src/static_deps/liblinebreak/wordbreak.c | |||
@@ -0,0 +1,435 @@ | |||
1 | /* vim: set tabstop=4 shiftwidth=4: */ | ||
2 | |||
3 | /* | ||
4 | * Word breaking in a Unicode sequence. Designed to be used in a | ||
5 | * generic text renderer. | ||
6 | * | ||
7 | * Copyright (C) 2011-2011 Tom Hacohen <tom@stosb.com> | ||
8 | * | ||
9 | * This software is provided 'as-is', without any express or implied | ||
10 | * warranty. In no event will the author be held liable for any damages | ||
11 | * arising from the use of this software. | ||
12 | * | ||
13 | * Permission is granted to anyone to use this software for any purpose, | ||
14 | * including commercial applications, and to alter it and redistribute | ||
15 | * it freely, subject to the following restrictions: | ||
16 | * | ||
17 | * 1. The origin of this software must not be misrepresented; you must | ||
18 | * not claim that you wrote the original software. If you use this | ||
19 | * software in a product, an acknowledgement in the product | ||
20 | * documentation would be appreciated but is not required. | ||
21 | * 2. Altered source versions must be plainly marked as such, and must | ||
22 | * not be misrepresented as being the original software. | ||
23 | * 3. This notice may not be removed or altered from any source | ||
24 | * distribution. | ||
25 | * | ||
26 | * The main reference is Unicode Standard Annex 29 (UAX #29): | ||
27 | * <URL:http://unicode.org/reports/tr29> | ||
28 | * | ||
29 | * When this library was designed, this annex was at Revision 17, for | ||
30 | * Unicode 6.0.0: | ||
31 | * <URL:http://www.unicode.org/reports/tr29/tr29-17.html> | ||
32 | * | ||
33 | * The Unicode Terms of Use are available at | ||
34 | * <URL:http://www.unicode.org/copyright.html> | ||
35 | */ | ||
36 | |||
37 | /** | ||
38 | * @file wordbreak.c | ||
39 | * | ||
40 | * Implementation of the word breaking algorithm as described in Unicode | ||
41 | * Standard Annex 29. | ||
42 | * | ||
43 | * @version 2.0, 2011/12/12 | ||
44 | * @author Tom Hacohen | ||
45 | */ | ||
46 | |||
47 | |||
48 | #include <assert.h> | ||
49 | #include <stddef.h> | ||
50 | #include <string.h> | ||
51 | #include "linebreak.h" | ||
52 | #include "linebreakdef.h" | ||
53 | |||
54 | #include "wordbreak.h" | ||
55 | #include "wordbreakdata.x" | ||
56 | |||
/* Number of elements in an actual array object (not valid for pointers). */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
58 | |||
/**
 * Initializes the word breaking internals.
 *
 * There is currently nothing to set up; the function is kept as a hook
 * in case initialization becomes necessary in the future.
 */
void init_wordbreak(void)
{
}
65 | |||
66 | /** | ||
67 | * Gets the word breaking class of a character. | ||
68 | * | ||
69 | * @param ch character to check | ||
70 | * @param wbp pointer to the wbp breaking properties array | ||
71 | * @param len the size of the wbp array in number of items. | ||
72 | * @return the word breaking class if found; \c WBP_Any otherwise | ||
73 | */ | ||
74 | static enum WordBreakClass get_char_wb_class( | ||
75 | utf32_t ch, | ||
76 | struct WordBreakProperties *wbp, | ||
77 | size_t len) | ||
78 | { | ||
79 | int min = 0; | ||
80 | int max = len - 1; | ||
81 | int mid; | ||
82 | |||
83 | do | ||
84 | { | ||
85 | mid = (min + max) / 2; | ||
86 | |||
87 | if (ch < wbp[mid].start) | ||
88 | max = mid - 1; | ||
89 | else if (ch > wbp[mid].end) | ||
90 | min = mid + 1; | ||
91 | else | ||
92 | return wbp[mid].prop; | ||
93 | } | ||
94 | while (min <= max); | ||
95 | |||
96 | return WBP_Any; | ||
97 | } | ||
98 | |||
99 | /** | ||
100 | * Sets the break types in brks starting from posLast up to posStop. | ||
101 | * | ||
102 | * It sets the inside chars to #WORDBREAK_INSIDECHAR and the rest to brkType. | ||
103 | * Assumes brks is initialized - all the cells with #WORDBREAK_NOBREAK are | ||
104 | * cells that we really don't want to break after. | ||
105 | * | ||
106 | * @param s the string | ||
107 | * @param brks[out] the breaks array to fill. | ||
108 | * @param posStart the start position | ||
109 | * @param posEnd the end position | ||
110 | * @param len the length of the string | ||
111 | * @param brkType the breaks type to use | ||
112 | * @param get_next_char function to get the next UTF-32 character | ||
113 | */ | ||
114 | static void set_brks_to(const void *s, | ||
115 | char *brks, | ||
116 | size_t posStart, | ||
117 | size_t posEnd, | ||
118 | size_t len, | ||
119 | char brkType, | ||
120 | get_next_char_t get_next_char) | ||
121 | { | ||
122 | size_t posCur = posStart; | ||
123 | while (posCur < posEnd) | ||
124 | { | ||
125 | get_next_char(s, len, &posCur); | ||
126 | for ( ; posStart < posCur - 1; ++posStart) | ||
127 | { | ||
128 | brks[posStart] = WORDBREAK_INSIDECHAR; | ||
129 | } | ||
130 | assert(posStart == posCur - 1); | ||
131 | |||
132 | /* Only set it if we haven't set it not to break before. */ | ||
133 | if (brks[posStart] != WORDBREAK_NOBREAK) | ||
134 | brks[posStart] = brkType; | ||
135 | posStart = posCur; | ||
136 | } | ||
137 | } | ||
138 | |||
/* Checks whether cls is Newline, CR or LF (the classes of WB3a and WB3b).
 * The argument is parenthesized so that expression arguments are compared
 * as a whole; note that it is still evaluated up to three times. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
        ((cls) == WBP_LF))
142 | |||
143 | /** | ||
144 | * Sets the word breaking information for a generic input string. | ||
145 | * | ||
146 | * @param[in] s input string | ||
147 | * @param[in] len length of the input | ||
148 | * @param[in] lang language of the input | ||
149 | * @param[out] brks pointer to the output breaking data, containing | ||
150 | * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or | ||
151 | * #WORDBREAK_INSIDEACHAR | ||
152 | * @param[in] get_next_char function to get the next UTF-32 character | ||
153 | */ | ||
154 | static void set_wordbreaks( | ||
155 | const void *s, | ||
156 | size_t len, | ||
157 | const char *lang, | ||
158 | char *brks, | ||
159 | get_next_char_t get_next_char) | ||
160 | { | ||
161 | /* Previous class */ | ||
162 | enum WordBreakClass p_cls = WBP_Undefined; | ||
163 | /* Strong previous class. */ | ||
164 | enum WordBreakClass sp_cls = WBP_Undefined; | ||
165 | utf32_t ch; | ||
166 | size_t posCur = 0; | ||
167 | size_t posCurSt = 0; | ||
168 | size_t posLast = 0; | ||
169 | |||
170 | /* FIXME: unused atm. */ | ||
171 | (void) lang; | ||
172 | |||
173 | |||
174 | /* Init brks */ | ||
175 | memset(brks, WORDBREAK_BREAK, len); | ||
176 | |||
177 | ch = get_next_char(s, len, &posCur); | ||
178 | |||
179 | /* WB3a, WB3b are implied. */ | ||
180 | for ( ; ch != EOS ; ) | ||
181 | { | ||
182 | /* Current class */ | ||
183 | enum WordBreakClass c_cls; | ||
184 | c_cls = get_char_wb_class(ch, wb_prop_default, | ||
185 | ARRAY_LEN(wb_prop_default)); | ||
186 | |||
187 | switch (c_cls) | ||
188 | { | ||
189 | case WBP_CR: | ||
190 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
191 | get_next_char); | ||
192 | sp_cls = c_cls; | ||
193 | posLast = posCurSt; | ||
194 | break; | ||
195 | |||
196 | case WBP_LF: | ||
197 | if (sp_cls == WBP_CR) /* WB3 */ | ||
198 | { | ||
199 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, | ||
200 | get_next_char); | ||
201 | sp_cls = c_cls; | ||
202 | posLast = posCurSt; | ||
203 | } | ||
204 | sp_cls = c_cls; | ||
205 | posLast = posCurSt; | ||
206 | break; | ||
207 | |||
208 | case WBP_Newline: | ||
209 | /* WB3a, WB3b */ | ||
210 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
211 | get_next_char); | ||
212 | sp_cls = c_cls; | ||
213 | posLast = posCurSt; | ||
214 | break; | ||
215 | |||
216 | case WBP_Extend: | ||
217 | case WBP_Format: | ||
218 | /* WB4 - If not the first char/after a newline (W3ab), | ||
219 | * skip this class, set it to be the same as the prev, and mark | ||
220 | * brks not to break before them. */ | ||
221 | if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls)) | ||
222 | { | ||
223 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
224 | get_next_char); | ||
225 | sp_cls = c_cls; | ||
226 | } | ||
227 | else | ||
228 | { | ||
229 | /* It's surely not the first */ | ||
230 | brks[posCurSt - 1] = WORDBREAK_NOBREAK; | ||
231 | /* "inherit" the previous class. */ | ||
232 | c_cls = p_cls; | ||
233 | } | ||
234 | break; | ||
235 | |||
236 | case WBP_Katakana: | ||
237 | if ((sp_cls == WBP_Katakana) || /* WB13 */ | ||
238 | (sp_cls == WBP_ExtendNumLet)) /* WB13b */ | ||
239 | { | ||
240 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, | ||
241 | get_next_char); | ||
242 | } | ||
243 | /* No rule found, reset */ | ||
244 | else | ||
245 | { | ||
246 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
247 | get_next_char); | ||
248 | } | ||
249 | sp_cls = c_cls; | ||
250 | posLast = posCurSt; | ||
251 | break; | ||
252 | |||
253 | case WBP_ALetter: | ||
254 | if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */ | ||
255 | ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */ | ||
256 | (sp_cls == WBP_ExtendNumLet)) /* WB13b */ | ||
257 | { | ||
258 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, | ||
259 | get_next_char); | ||
260 | } | ||
261 | /* No rule found, reset */ | ||
262 | else | ||
263 | { | ||
264 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
265 | get_next_char); | ||
266 | } | ||
267 | sp_cls = c_cls; | ||
268 | posLast = posCurSt; | ||
269 | break; | ||
270 | |||
271 | case WBP_MidNumLet: | ||
272 | if ((p_cls == WBP_ALetter) || /* WBP6,7 */ | ||
273 | (p_cls == WBP_Numeric)) /* WBP11,12 */ | ||
274 | { | ||
275 | /* Go on */ | ||
276 | } | ||
277 | else | ||
278 | { | ||
279 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
280 | get_next_char); | ||
281 | sp_cls = c_cls; | ||
282 | posLast = posCurSt; | ||
283 | } | ||
284 | break; | ||
285 | |||
286 | case WBP_MidLetter: | ||
287 | if (p_cls == WBP_ALetter) /* WBP6,7 */ | ||
288 | { | ||
289 | /* Go on */ | ||
290 | } | ||
291 | else | ||
292 | { | ||
293 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
294 | get_next_char); | ||
295 | sp_cls = c_cls; | ||
296 | posLast = posCurSt; | ||
297 | } | ||
298 | break; | ||
299 | |||
300 | case WBP_MidNum: | ||
301 | if (p_cls == WBP_Numeric) /* WBP11,12 */ | ||
302 | { | ||
303 | /* Go on */ | ||
304 | } | ||
305 | else | ||
306 | { | ||
307 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
308 | get_next_char); | ||
309 | sp_cls = c_cls; | ||
310 | posLast = posCurSt; | ||
311 | } | ||
312 | break; | ||
313 | |||
314 | case WBP_Numeric: | ||
315 | if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */ | ||
316 | ((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */ | ||
317 | (sp_cls == WBP_ExtendNumLet)) /* WB13b */ | ||
318 | { | ||
319 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, | ||
320 | get_next_char); | ||
321 | } | ||
322 | /* No rule found, reset */ | ||
323 | else | ||
324 | { | ||
325 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
326 | get_next_char); | ||
327 | } | ||
328 | sp_cls = c_cls; | ||
329 | posLast = posCurSt; | ||
330 | break; | ||
331 | |||
332 | case WBP_ExtendNumLet: | ||
333 | /* WB13a,13b */ | ||
334 | if ((sp_cls == p_cls) && | ||
335 | ((p_cls == WBP_ALetter) || | ||
336 | (p_cls == WBP_Numeric) || | ||
337 | (p_cls == WBP_Katakana) || | ||
338 | (p_cls == WBP_ExtendNumLet))) | ||
339 | { | ||
340 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, | ||
341 | get_next_char); | ||
342 | } | ||
343 | /* No rule found, reset */ | ||
344 | else | ||
345 | { | ||
346 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
347 | get_next_char); | ||
348 | } | ||
349 | sp_cls = c_cls; | ||
350 | posLast = posCurSt; | ||
351 | break; | ||
352 | |||
353 | case WBP_Any: | ||
354 | /* Allow breaks and reset */ | ||
355 | set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, | ||
356 | get_next_char); | ||
357 | sp_cls = c_cls; | ||
358 | posLast = posCurSt; | ||
359 | break; | ||
360 | |||
361 | default: | ||
362 | /* Error, should never get here! */ | ||
363 | assert(0); | ||
364 | break; | ||
365 | } | ||
366 | |||
367 | p_cls = c_cls; | ||
368 | posCurSt = posCur; | ||
369 | ch = get_next_char(s, len, &posCur); | ||
370 | } | ||
371 | |||
372 | /* WB2 */ | ||
373 | set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK, | ||
374 | get_next_char); | ||
375 | } | ||
376 | |||
377 | /** | ||
378 | * Sets the word breaking information for a UTF-8 input string. | ||
379 | * | ||
380 | * @param[in] s input UTF-8 string | ||
381 | * @param[in] len length of the input | ||
382 | * @param[in] lang language of the input | ||
383 | * @param[out] brks pointer to the output breaking data, containing | ||
384 | * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or | ||
385 | * #WORDBREAK_INSIDEACHAR | ||
386 | */ | ||
387 | void set_wordbreaks_utf8( | ||
388 | const utf8_t *s, | ||
389 | size_t len, | ||
390 | const char *lang, | ||
391 | char *brks) | ||
392 | { | ||
393 | set_wordbreaks(s, len, lang, brks, | ||
394 | (get_next_char_t)lb_get_next_char_utf8); | ||
395 | } | ||
396 | |||
397 | /** | ||
398 | * Sets the word breaking information for a UTF-16 input string. | ||
399 | * | ||
400 | * @param[in] s input UTF-16 string | ||
401 | * @param[in] len length of the input | ||
402 | * @param[in] lang language of the input | ||
403 | * @param[out] brks pointer to the output breaking data, containing | ||
404 | * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or | ||
405 | * #WORDBREAK_INSIDEACHAR | ||
406 | */ | ||
407 | void set_wordbreaks_utf16( | ||
408 | const utf16_t *s, | ||
409 | size_t len, | ||
410 | const char *lang, | ||
411 | char *brks) | ||
412 | { | ||
413 | set_wordbreaks(s, len, lang, brks, | ||
414 | (get_next_char_t)lb_get_next_char_utf16); | ||
415 | } | ||
416 | |||
417 | /** | ||
418 | * Sets the word breaking information for a UTF-32 input string. | ||
419 | * | ||
420 | * @param[in] s input UTF-32 string | ||
421 | * @param[in] len length of the input | ||
422 | * @param[in] lang language of the input | ||
423 | * @param[out] brks pointer to the output breaking data, containing | ||
424 | * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or | ||
425 | * #WORDBREAK_INSIDEACHAR | ||
426 | */ | ||
427 | void set_wordbreaks_utf32( | ||
428 | const utf32_t *s, | ||
429 | size_t len, | ||
430 | const char *lang, | ||
431 | char *brks) | ||
432 | { | ||
433 | set_wordbreaks(s, len, lang, brks, | ||
434 | (get_next_char_t)lb_get_next_char_utf32); | ||
435 | } | ||