aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/evas/src/static_deps/liblinebreak/linebreak.c
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/evas/src/static_deps/liblinebreak/linebreak.c')
-rw-r--r--libraries/evas/src/static_deps/liblinebreak/linebreak.c737
1 files changed, 737 insertions, 0 deletions
diff --git a/libraries/evas/src/static_deps/liblinebreak/linebreak.c b/libraries/evas/src/static_deps/liblinebreak/linebreak.c
new file mode 100644
index 0000000..f9ff9a1
--- /dev/null
+++ b/libraries/evas/src/static_deps/liblinebreak/linebreak.c
@@ -0,0 +1,737 @@
1/* vim: set tabstop=4 shiftwidth=4: */
2
3/*
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
24 * distribution.
25 *
26 * The main reference is Unicode Standard Annex 14 (UAX #14):
27 * <URL:http://www.unicode.org/reports/tr14/>
28 *
29 * When this library was designed, this annex was at Revision 19, for
30 * Unicode 5.0.0:
31 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
32 *
33 * This library has been updated according to Revision 24, for
34 * Unicode 5.2.0:
35 * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
36 *
37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html>
39 */
40
41/**
42 * @file linebreak.c
43 *
44 * Implementation of the line breaking algorithm as described in Unicode
45 * Standard Annex 14.
46 *
47 * @version 2.0, 2010/01/03
48 * @author Wu Yongwei
49 */
50
51#include <assert.h>
52#include <stddef.h>
53#include <string.h>
54#include "linebreak.h"
55#include "linebreakdef.h"
56
/**
 * Number of entries in the second-level index that is laid over the
 * default line breaking properties array.
 */
#define LINEBREAK_INDEX_SIZE 40
61
62/**
63 * Version number of the library.
64 */
65const int linebreak_version = LINEBREAK_VERSION;
66
/**
 * Actions that can apply between two adjacent line breaking classes.
 * These are the cell values stored in the break action pair table.
 */
enum BreakAction
{
    DIR_BRK = 0,    /**< Direct break opportunity */
    IND_BRK = 1,    /**< Indirect break opportunity */
    CMI_BRK = 2,    /**< Indirect break opportunity for combining marks */
    CMP_BRK = 3,    /**< Prohibited break for combining marks */
    PRH_BRK = 4     /**< Prohibited break */
};
79
80/**
81 * Break action pair table. This is a direct mapping of Table 2 of
82 * Unicode Standard Annex 14, Revision 24.
83 */
84static enum BreakAction baTable[LBP_JT][LBP_JT] = {
85 { /* OP */
86 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
87 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
88 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
89 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
90 { /* CL */
91 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
92 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
93 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
94 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
95 { /* CP */
96 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
97 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
98 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
99 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
100 { /* QU */
101 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
102 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
103 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
104 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
105 { /* GL */
106 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
107 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
108 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
109 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
110 { /* NS */
111 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
112 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
113 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
114 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
115 { /* EX */
116 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
117 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
118 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
119 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
120 { /* SY */
121 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
122 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
123 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
124 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
125 { /* IS */
126 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
127 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
128 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
129 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
130 { /* PR */
131 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
132 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
133 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
134 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
135 { /* PO */
136 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
137 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
138 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
139 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
140 { /* NU */
141 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
142 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
143 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
144 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
145 { /* AL */
146 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
147 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
148 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
149 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
150 { /* ID */
151 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
152 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
153 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
154 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
155 { /* IN */
156 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
157 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
158 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
159 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
160 { /* HY */
161 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
162 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
163 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
164 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
165 { /* BA */
166 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
167 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
168 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
169 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
170 { /* BB */
171 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
172 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
173 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
174 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
175 { /* B2 */
176 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
177 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
178 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
179 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
180 { /* ZW */
181 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
182 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
183 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
184 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
185 { /* CM */
186 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
187 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
188 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
189 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
190 { /* WJ */
191 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
192 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
193 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
194 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
195 { /* H2 */
196 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
197 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
198 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
199 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
200 { /* H3 */
201 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
202 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
203 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
204 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
205 { /* JL */
206 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
207 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
208 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
209 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
210 { /* JV */
211 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
212 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
213 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
214 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
215 { /* JT */
216 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
217 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
218 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
219 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
220};
221
222/**
223 * Struct for the second-level index to the line breaking properties.
224 */
225struct LineBreakPropertiesIndex
226{
227 utf32_t end; /**< End coding point */
228 struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
229};
230
231/**
232 * Second-level index to the line breaking properties.
233 */
234static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
235{
236 { 0xFFFFFFFF, lb_prop_default }
237};
238
239/**
240 * Initializes the second-level index to the line breaking properties.
241 * If it is not called, the performance of #get_char_lb_class_lang (and
242 * thus the main functionality) can be pretty bad, especially for big
243 * code points like those of Chinese.
244 */
245void init_linebreak(void)
246{
247 size_t i;
248 size_t iPropDefault;
249 size_t len;
250 size_t step;
251
252 len = 0;
253 while (lb_prop_default[len].prop != LBP_Undefined)
254 ++len;
255 step = len / LINEBREAK_INDEX_SIZE;
256 iPropDefault = 0;
257 for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
258 {
259 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
260 iPropDefault += step;
261 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
262 }
263 lb_prop_index[--i].end = 0xFFFFFFFF;
264}
265
266/**
267 * Gets the language-specific line breaking properties.
268 *
269 * @param lang language of the text
270 * @return pointer to the language-specific line breaking
271 * properties array if found; \c NULL otherwise
272 */
273static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
274{
275 struct LineBreakPropertiesLang *lbplIter;
276 if (lang != NULL)
277 {
278 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
279 {
280 if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
281 {
282 return lbplIter->lbp;
283 }
284 }
285 }
286 return NULL;
287}
288
289/**
290 * Gets the line breaking class of a character from a line breaking
291 * properties array.
292 *
293 * @param ch character to check
294 * @param lbp pointer to the line breaking properties array
295 * @return the line breaking class if found; \c LBP_XX otherwise
296 */
297static enum LineBreakClass get_char_lb_class(
298 utf32_t ch,
299 struct LineBreakProperties *lbp)
300{
301 while (lbp->prop != LBP_Undefined && ch >= lbp->start)
302 {
303 if (ch <= lbp->end)
304 return lbp->prop;
305 ++lbp;
306 }
307 return LBP_XX;
308}
309
310/**
311 * Gets the line breaking class of a character from the default line
312 * breaking properties array.
313 *
314 * @param ch character to check
315 * @return the line breaking class if found; \c LBP_XX otherwise
316 */
317static enum LineBreakClass get_char_lb_class_default(
318 utf32_t ch)
319{
320 size_t i = 0;
321 while (ch > lb_prop_index[i].end)
322 ++i;
323 assert(i < LINEBREAK_INDEX_SIZE);
324 return get_char_lb_class(ch, lb_prop_index[i].lbp);
325}
326
327/**
328 * Gets the line breaking class of a character for a specific
329 * language. This function will check the language-specific data first,
330 * and then the default data if there is no language-specific property
331 * available for the character.
332 *
333 * @param ch character to check
334 * @param lbpLang pointer to the language-specific line breaking
335 * properties array
336 * @return the line breaking class if found; \c LBP_XX
337 * otherwise
338 */
339static enum LineBreakClass get_char_lb_class_lang(
340 utf32_t ch,
341 struct LineBreakProperties *lbpLang)
342{
343 enum LineBreakClass lbcResult;
344
345 /* Find the language-specific line breaking class for a character */
346 if (lbpLang)
347 {
348 lbcResult = get_char_lb_class(ch, lbpLang);
349 if (lbcResult != LBP_XX)
350 return lbcResult;
351 }
352
353 /* Find the generic language-specific line breaking class, if no
354 * language context is provided, or language-specific data are not
355 * available for the specific character in the specified language */
356 return get_char_lb_class_default(ch);
357}
358
359/**
360 * Resolves the line breaking class for certain ambiguous or complicated
361 * characters. They are treated in a simplistic way in this
362 * implementation.
363 *
364 * @param lbc line breaking class to resolve
365 * @param lang language of the text
366 * @return the resolved line breaking class
367 */
368static enum LineBreakClass resolve_lb_class(
369 enum LineBreakClass lbc,
370 const char *lang)
371{
372 switch (lbc)
373 {
374 case LBP_AI:
375 if (lang != NULL &&
376 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
377 strncmp(lang, "ja", 2) == 0 || /* Japanese */
378 strncmp(lang, "ko", 2) == 0)) /* Korean */
379 {
380 return LBP_ID;
381 }
382 /* Fall through */
383 case LBP_SA:
384 case LBP_SG:
385 case LBP_XX:
386 return LBP_AL;
387 default:
388 return lbc;
389 }
390}
391
392/**
393 * Gets the next Unicode character in a UTF-8 sequence. The index will
394 * be advanced to the next complete character, unless the end of string
395 * is reached in the middle of a UTF-8 sequence.
396 *
397 * @param[in] s input UTF-8 string
398 * @param[in] len length of the string in bytes
399 * @param[in,out] ip pointer to the index
400 * @return the Unicode character beginning at the index; or
401 * #EOS if end of input is encountered
402 */
403utf32_t lb_get_next_char_utf8(
404 const utf8_t *s,
405 size_t len,
406 size_t *ip)
407{
408 utf8_t ch;
409 utf32_t res;
410
411 assert(*ip <= len);
412 if (*ip == len)
413 return EOS;
414 ch = s[*ip];
415
416 if (ch < 0xC2 || ch > 0xF4)
417 { /* One-byte sequence, tail (should not occur), or invalid */
418 *ip += 1;
419 return ch;
420 }
421 else if (ch < 0xE0)
422 { /* Two-byte sequence */
423 if (*ip + 2 > len)
424 return EOS;
425 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
426 *ip += 2;
427 return res;
428 }
429 else if (ch < 0xF0)
430 { /* Three-byte sequence */
431 if (*ip + 3 > len)
432 return EOS;
433 res = ((ch & 0x0F) << 12) +
434 ((s[*ip + 1] & 0x3F) << 6) +
435 ((s[*ip + 2] & 0x3F));
436 *ip += 3;
437 return res;
438 }
439 else
440 { /* Four-byte sequence */
441 if (*ip + 4 > len)
442 return EOS;
443 res = ((ch & 0x07) << 18) +
444 ((s[*ip + 1] & 0x3F) << 12) +
445 ((s[*ip + 2] & 0x3F) << 6) +
446 ((s[*ip + 3] & 0x3F));
447 *ip += 4;
448 return res;
449 }
450}
451
452/**
453 * Gets the next Unicode character in a UTF-16 sequence. The index will
454 * be advanced to the next complete character, unless the end of string
455 * is reached in the middle of a UTF-16 surrogate pair.
456 *
457 * @param[in] s input UTF-16 string
458 * @param[in] len length of the string in words
459 * @param[in,out] ip pointer to the index
460 * @return the Unicode character beginning at the index; or
461 * #EOS if end of input is encountered
462 */
463utf32_t lb_get_next_char_utf16(
464 const utf16_t *s,
465 size_t len,
466 size_t *ip)
467{
468 utf16_t ch;
469
470 assert(*ip <= len);
471 if (*ip == len)
472 return EOS;
473 ch = s[(*ip)++];
474
475 if (ch < 0xD800 || ch > 0xDBFF)
476 { /* If the character is not a high surrogate */
477 return ch;
478 }
479 if (*ip == len)
480 { /* If the input ends here (an error) */
481 --(*ip);
482 return EOS;
483 }
484 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
485 { /* If the next character is not the low surrogate (an error) */
486 return ch;
487 }
488 /* Return the constructed character and advance the index again */
489 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
490}
491
492/**
493 * Gets the next Unicode character in a UTF-32 sequence. The index will
494 * be advanced to the next character.
495 *
496 * @param[in] s input UTF-32 string
497 * @param[in] len length of the string in dwords
498 * @param[in,out] ip pointer to the index
499 * @return the Unicode character beginning at the index; or
500 * #EOS if end of input is encountered
501 */
502utf32_t lb_get_next_char_utf32(
503 const utf32_t *s,
504 size_t len,
505 size_t *ip)
506{
507 assert(*ip <= len);
508 if (*ip == len)
509 return EOS;
510 return s[(*ip)++];
511}
512
513/**
514 * Sets the line breaking information for a generic input string.
515 *
516 * @param[in] s input string
517 * @param[in] len length of the input
518 * @param[in] lang language of the input
519 * @param[out] brks pointer to the output breaking data,
520 * containing #LINEBREAK_MUSTBREAK,
521 * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
522 * or #LINEBREAK_INSIDEACHAR
523 * @param[in] get_next_char function to get the next UTF-32 character
524 */
525void set_linebreaks(
526 const void *s,
527 size_t len,
528 const char *lang,
529 char *brks,
530 get_next_char_t get_next_char)
531{
532 utf32_t ch;
533 enum LineBreakClass lbcCur;
534 enum LineBreakClass lbcNew;
535 enum LineBreakClass lbcLast;
536 struct LineBreakProperties *lbpLang;
537 size_t posCur = 0;
538 size_t posLast = 0;
539
540 --posLast; /* To be ++'d later */
541 ch = get_next_char(s, len, &posCur);
542 if (ch == EOS)
543 return;
544 lbpLang = get_lb_prop_lang(lang);
545 lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
546 lbcNew = LBP_Undefined;
547
548nextline:
549
550 /* Special treatment for the first character */
551 switch (lbcCur)
552 {
553 case LBP_LF:
554 case LBP_NL:
555 lbcCur = LBP_BK;
556 break;
557 case LBP_CB:
558 lbcCur = LBP_BA;
559 break;
560 case LBP_SP:
561 lbcCur = LBP_WJ;
562 break;
563 default:
564 break;
565 }
566
567 /* Process a line till an explicit break or end of string */
568 for (;;)
569 {
570 for (++posLast; posLast < posCur - 1; ++posLast)
571 {
572 brks[posLast] = LINEBREAK_INSIDEACHAR;
573 }
574 assert(posLast == posCur - 1);
575 lbcLast = lbcNew;
576 ch = get_next_char(s, len, &posCur);
577 if (ch == EOS)
578 break;
579 lbcNew = get_char_lb_class_lang(ch, lbpLang);
580 if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
581 {
582 brks[posLast] = LINEBREAK_MUSTBREAK;
583 lbcCur = resolve_lb_class(lbcNew, lang);
584 goto nextline;
585 }
586
587 switch (lbcNew)
588 {
589 case LBP_SP:
590 brks[posLast] = LINEBREAK_NOBREAK;
591 continue;
592 case LBP_BK:
593 case LBP_LF:
594 case LBP_NL:
595 brks[posLast] = LINEBREAK_NOBREAK;
596 lbcCur = LBP_BK;
597 continue;
598 case LBP_CR:
599 brks[posLast] = LINEBREAK_NOBREAK;
600 lbcCur = LBP_CR;
601 continue;
602 case LBP_CB:
603 brks[posLast] = LINEBREAK_ALLOWBREAK;
604 lbcCur = LBP_BA;
605 continue;
606 default:
607 break;
608 }
609
610 lbcNew = resolve_lb_class(lbcNew, lang);
611
612 assert(lbcCur <= LBP_JT);
613 assert(lbcNew <= LBP_JT);
614 switch (baTable[lbcCur - 1][lbcNew - 1])
615 {
616 case DIR_BRK:
617 brks[posLast] = LINEBREAK_ALLOWBREAK;
618 break;
619 case CMI_BRK:
620 case IND_BRK:
621 if (lbcLast == LBP_SP)
622 {
623 brks[posLast] = LINEBREAK_ALLOWBREAK;
624 }
625 else
626 {
627 brks[posLast] = LINEBREAK_NOBREAK;
628 }
629 break;
630 case CMP_BRK:
631 brks[posLast] = LINEBREAK_NOBREAK;
632 if (lbcLast != LBP_SP)
633 continue;
634 break;
635 case PRH_BRK:
636 brks[posLast] = LINEBREAK_NOBREAK;
637 break;
638 }
639
640 lbcCur = lbcNew;
641 }
642
643 assert(posLast == posCur - 1 && posCur <= len);
644 /* Break after the last character */
645 brks[posLast] = LINEBREAK_MUSTBREAK;
646 /* When the input contains incomplete sequences */
647 while (posCur < len)
648 {
649 brks[posCur++] = LINEBREAK_INSIDEACHAR;
650 }
651}
652
653/**
654 * Sets the line breaking information for a UTF-8 input string.
655 *
656 * @param[in] s input UTF-8 string
657 * @param[in] len length of the input
658 * @param[in] lang language of the input
659 * @param[out] brks pointer to the output breaking data, containing
660 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
661 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
662 */
663void set_linebreaks_utf8(
664 const utf8_t *s,
665 size_t len,
666 const char *lang,
667 char *brks)
668{
669 set_linebreaks(s, len, lang, brks,
670 (get_next_char_t)lb_get_next_char_utf8);
671}
672
673/**
674 * Sets the line breaking information for a UTF-16 input string.
675 *
676 * @param[in] s input UTF-16 string
677 * @param[in] len length of the input
678 * @param[in] lang language of the input
679 * @param[out] brks pointer to the output breaking data, containing
680 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
681 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
682 */
683void set_linebreaks_utf16(
684 const utf16_t *s,
685 size_t len,
686 const char *lang,
687 char *brks)
688{
689 set_linebreaks(s, len, lang, brks,
690 (get_next_char_t)lb_get_next_char_utf16);
691}
692
693/**
694 * Sets the line breaking information for a UTF-32 input string.
695 *
696 * @param[in] s input UTF-32 string
697 * @param[in] len length of the input
698 * @param[in] lang language of the input
699 * @param[out] brks pointer to the output breaking data, containing
700 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
701 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
702 */
703void set_linebreaks_utf32(
704 const utf32_t *s,
705 size_t len,
706 const char *lang,
707 char *brks)
708{
709 set_linebreaks(s, len, lang, brks,
710 (get_next_char_t)lb_get_next_char_utf32);
711}
712
713/**
714 * Tells whether a line break can occur between two Unicode characters.
715 * This is a wrapper function to expose a simple interface. Generally
716 * speaking, it is better to use #set_linebreaks_utf32 instead, since
717 * complicated cases involving combining marks, spaces, etc. cannot be
718 * correctly processed.
719 *
720 * @param char1 the first Unicode character
721 * @param char2 the second Unicode character
722 * @param lang language of the input
723 * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
724 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
725 */
726int is_line_breakable(
727 utf32_t char1,
728 utf32_t char2,
729 const char* lang)
730{
731 utf32_t s[2];
732 char brks[2];
733 s[0] = char1;
734 s[1] = char2;
735 set_linebreaks_utf32(s, 2, lang, brks);
736 return brks[0];
737}