aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/evas/src/static_deps/liblinebreak/wordbreak.c
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/evas/src/static_deps/liblinebreak/wordbreak.c')
-rw-r--r--libraries/evas/src/static_deps/liblinebreak/wordbreak.c435
1 files changed, 435 insertions, 0 deletions
diff --git a/libraries/evas/src/static_deps/liblinebreak/wordbreak.c b/libraries/evas/src/static_deps/liblinebreak/wordbreak.c
new file mode 100644
index 0000000..bbbb7f4
--- /dev/null
+++ b/libraries/evas/src/static_deps/liblinebreak/wordbreak.c
@@ -0,0 +1,435 @@
1/* vim: set tabstop=4 shiftwidth=4: */
2
3/*
4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2011-2011 Tom Hacohen <tom@stosb.com>
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
24 * distribution.
25 *
26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29>
28 *
29 * When this library was designed, this annex was at Revision 17, for
30 * Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32 *
33 * The Unicode Terms of Use are available at
34 * <URL:http://www.unicode.org/copyright.html>
35 */
36
37/**
38 * @file wordbreak.c
39 *
40 * Implementation of the word breaking algorithm as described in Unicode
41 * Standard Annex 29.
42 *
43 * @version 2.0, 2011/12/12
44 * @author Tom Hacohen
45 */
46
47
48#include <assert.h>
49#include <stddef.h>
50#include <string.h>
51#include "linebreak.h"
52#include "linebreakdef.h"
53
54#include "wordbreak.h"
55#include "wordbreakdata.x"
56
/* Number of elements in a true array (not a pointer). The argument is
 * parenthesized so that expression arguments expand correctly. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
58
/**
 * Initialises the word breaking internals.
 *
 * There is nothing to set up at present; the function exists so that
 * callers have a stable entry point should initialisation become
 * necessary later.
 */
void init_wordbreak(void)
{
    /* Intentionally empty. */
}
65
66/**
67 * Gets the word breaking class of a character.
68 *
69 * @param ch character to check
70 * @param wbp pointer to the wbp breaking properties array
71 * @param len the size of the wbp array in number of items.
72 * @return the word breaking class if found; \c WBP_Any otherwise
73 */
74static enum WordBreakClass get_char_wb_class(
75 utf32_t ch,
76 struct WordBreakProperties *wbp,
77 size_t len)
78{
79 int min = 0;
80 int max = len - 1;
81 int mid;
82
83 do
84 {
85 mid = (min + max) / 2;
86
87 if (ch < wbp[mid].start)
88 max = mid - 1;
89 else if (ch > wbp[mid].end)
90 min = mid + 1;
91 else
92 return wbp[mid].prop;
93 }
94 while (min <= max);
95
96 return WBP_Any;
97}
98
99/**
100 * Sets the break types in brks starting from posLast up to posStop.
101 *
102 * It sets the inside chars to #WORDBREAK_INSIDECHAR and the rest to brkType.
103 * Assumes brks is initialized - all the cells with #WORDBREAK_NOBREAK are
104 * cells that we really don't want to break after.
105 *
106 * @param s the string
107 * @param brks[out] the breaks array to fill.
108 * @param posStart the start position
109 * @param posEnd the end position
110 * @param len the length of the string
111 * @param brkType the breaks type to use
112 * @param get_next_char function to get the next UTF-32 character
113 */
114static void set_brks_to(const void *s,
115 char *brks,
116 size_t posStart,
117 size_t posEnd,
118 size_t len,
119 char brkType,
120 get_next_char_t get_next_char)
121{
122 size_t posCur = posStart;
123 while (posCur < posEnd)
124 {
125 get_next_char(s, len, &posCur);
126 for ( ; posStart < posCur - 1; ++posStart)
127 {
128 brks[posStart] = WORDBREAK_INSIDECHAR;
129 }
130 assert(posStart == posCur - 1);
131
132 /* Only set it if we haven't set it not to break before. */
133 if (brks[posStart] != WORDBREAK_NOBREAK)
134 brks[posStart] = brkType;
135 posStart = posCur;
136 }
137}
138
/* Checks to see if newline, cr, or lf. for WB3a and b.
 * The argument is parenthesized so expression arguments expand correctly;
 * note it may be evaluated up to three times, so it must be free of side
 * effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
                       ((cls) == WBP_LF))
142
143/**
144 * Sets the word breaking information for a generic input string.
145 *
146 * @param[in] s input string
147 * @param[in] len length of the input
148 * @param[in] lang language of the input
149 * @param[out] brks pointer to the output breaking data, containing
150 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
151 * #WORDBREAK_INSIDEACHAR
152 * @param[in] get_next_char function to get the next UTF-32 character
153 */
154static void set_wordbreaks(
155 const void *s,
156 size_t len,
157 const char *lang,
158 char *brks,
159 get_next_char_t get_next_char)
160{
161 /* Previous class */
162 enum WordBreakClass p_cls = WBP_Undefined;
163 /* Strong previous class. */
164 enum WordBreakClass sp_cls = WBP_Undefined;
165 utf32_t ch;
166 size_t posCur = 0;
167 size_t posCurSt = 0;
168 size_t posLast = 0;
169
170 /* FIXME: unused atm. */
171 (void) lang;
172
173
174 /* Init brks */
175 memset(brks, WORDBREAK_BREAK, len);
176
177 ch = get_next_char(s, len, &posCur);
178
179 /* WB3a, WB3b are implied. */
180 for ( ; ch != EOS ; )
181 {
182 /* Current class */
183 enum WordBreakClass c_cls;
184 c_cls = get_char_wb_class(ch, wb_prop_default,
185 ARRAY_LEN(wb_prop_default));
186
187 switch (c_cls)
188 {
189 case WBP_CR:
190 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
191 get_next_char);
192 sp_cls = c_cls;
193 posLast = posCurSt;
194 break;
195
196 case WBP_LF:
197 if (sp_cls == WBP_CR) /* WB3 */
198 {
199 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
200 get_next_char);
201 sp_cls = c_cls;
202 posLast = posCurSt;
203 }
204 sp_cls = c_cls;
205 posLast = posCurSt;
206 break;
207
208 case WBP_Newline:
209 /* WB3a, WB3b */
210 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
211 get_next_char);
212 sp_cls = c_cls;
213 posLast = posCurSt;
214 break;
215
216 case WBP_Extend:
217 case WBP_Format:
218 /* WB4 - If not the first char/after a newline (W3ab),
219 * skip this class, set it to be the same as the prev, and mark
220 * brks not to break before them. */
221 if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls))
222 {
223 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
224 get_next_char);
225 sp_cls = c_cls;
226 }
227 else
228 {
229 /* It's surely not the first */
230 brks[posCurSt - 1] = WORDBREAK_NOBREAK;
231 /* "inherit" the previous class. */
232 c_cls = p_cls;
233 }
234 break;
235
236 case WBP_Katakana:
237 if ((sp_cls == WBP_Katakana) || /* WB13 */
238 (sp_cls == WBP_ExtendNumLet)) /* WB13b */
239 {
240 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
241 get_next_char);
242 }
243 /* No rule found, reset */
244 else
245 {
246 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
247 get_next_char);
248 }
249 sp_cls = c_cls;
250 posLast = posCurSt;
251 break;
252
253 case WBP_ALetter:
254 if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */
255 ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */
256 (sp_cls == WBP_ExtendNumLet)) /* WB13b */
257 {
258 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
259 get_next_char);
260 }
261 /* No rule found, reset */
262 else
263 {
264 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
265 get_next_char);
266 }
267 sp_cls = c_cls;
268 posLast = posCurSt;
269 break;
270
271 case WBP_MidNumLet:
272 if ((p_cls == WBP_ALetter) || /* WBP6,7 */
273 (p_cls == WBP_Numeric)) /* WBP11,12 */
274 {
275 /* Go on */
276 }
277 else
278 {
279 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
280 get_next_char);
281 sp_cls = c_cls;
282 posLast = posCurSt;
283 }
284 break;
285
286 case WBP_MidLetter:
287 if (p_cls == WBP_ALetter) /* WBP6,7 */
288 {
289 /* Go on */
290 }
291 else
292 {
293 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
294 get_next_char);
295 sp_cls = c_cls;
296 posLast = posCurSt;
297 }
298 break;
299
300 case WBP_MidNum:
301 if (p_cls == WBP_Numeric) /* WBP11,12 */
302 {
303 /* Go on */
304 }
305 else
306 {
307 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
308 get_next_char);
309 sp_cls = c_cls;
310 posLast = posCurSt;
311 }
312 break;
313
314 case WBP_Numeric:
315 if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */
316 ((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */
317 (sp_cls == WBP_ExtendNumLet)) /* WB13b */
318 {
319 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
320 get_next_char);
321 }
322 /* No rule found, reset */
323 else
324 {
325 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
326 get_next_char);
327 }
328 sp_cls = c_cls;
329 posLast = posCurSt;
330 break;
331
332 case WBP_ExtendNumLet:
333 /* WB13a,13b */
334 if ((sp_cls == p_cls) &&
335 ((p_cls == WBP_ALetter) ||
336 (p_cls == WBP_Numeric) ||
337 (p_cls == WBP_Katakana) ||
338 (p_cls == WBP_ExtendNumLet)))
339 {
340 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
341 get_next_char);
342 }
343 /* No rule found, reset */
344 else
345 {
346 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
347 get_next_char);
348 }
349 sp_cls = c_cls;
350 posLast = posCurSt;
351 break;
352
353 case WBP_Any:
354 /* Allow breaks and reset */
355 set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
356 get_next_char);
357 sp_cls = c_cls;
358 posLast = posCurSt;
359 break;
360
361 default:
362 /* Error, should never get here! */
363 assert(0);
364 break;
365 }
366
367 p_cls = c_cls;
368 posCurSt = posCur;
369 ch = get_next_char(s, len, &posCur);
370 }
371
372 /* WB2 */
373 set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK,
374 get_next_char);
375}
376
377/**
378 * Sets the word breaking information for a UTF-8 input string.
379 *
380 * @param[in] s input UTF-8 string
381 * @param[in] len length of the input
382 * @param[in] lang language of the input
383 * @param[out] brks pointer to the output breaking data, containing
384 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
385 * #WORDBREAK_INSIDEACHAR
386 */
387void set_wordbreaks_utf8(
388 const utf8_t *s,
389 size_t len,
390 const char *lang,
391 char *brks)
392{
393 set_wordbreaks(s, len, lang, brks,
394 (get_next_char_t)lb_get_next_char_utf8);
395}
396
397/**
398 * Sets the word breaking information for a UTF-16 input string.
399 *
400 * @param[in] s input UTF-16 string
401 * @param[in] len length of the input
402 * @param[in] lang language of the input
403 * @param[out] brks pointer to the output breaking data, containing
404 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
405 * #WORDBREAK_INSIDEACHAR
406 */
407void set_wordbreaks_utf16(
408 const utf16_t *s,
409 size_t len,
410 const char *lang,
411 char *brks)
412{
413 set_wordbreaks(s, len, lang, brks,
414 (get_next_char_t)lb_get_next_char_utf16);
415}
416
417/**
418 * Sets the word breaking information for a UTF-32 input string.
419 *
420 * @param[in] s input UTF-32 string
421 * @param[in] len length of the input
422 * @param[in] lang language of the input
423 * @param[out] brks pointer to the output breaking data, containing
424 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
425 * #WORDBREAK_INSIDEACHAR
426 */
427void set_wordbreaks_utf32(
428 const utf32_t *s,
429 size_t len,
430 const char *lang,
431 char *brks)
432{
433 set_wordbreaks(s, len, lang, brks,
434 (get_next_char_t)lb_get_next_char_utf32);
435}