diff options
author | David Walter Seikel | 2012-01-04 18:41:13 +1000 |
---|---|---|
committer | David Walter Seikel | 2012-01-04 18:41:13 +1000 |
commit | dd7595a3475407a7fa96a97393bae8c5220e8762 (patch) | |
tree | e341e911d7eb911a51684a7412ef7f7c7605d28e /libraries/evas/src/static_deps/liblinebreak/linebreak.c | |
parent | Add the skeleton. (diff) | |
download | SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.zip SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.tar.gz SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.tar.bz2 SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.tar.xz |
Add the base Enlightenment Foundation Libraries - eina, eet, evas, ecore, embryo, and edje.
Note that embryo won't be used, but I'm not sure yet if you can build edje without it.
Diffstat (limited to '')
-rw-r--r-- | libraries/evas/src/static_deps/liblinebreak/linebreak.c | 737 |
1 files changed, 737 insertions, 0 deletions
diff --git a/libraries/evas/src/static_deps/liblinebreak/linebreak.c b/libraries/evas/src/static_deps/liblinebreak/linebreak.c new file mode 100644 index 0000000..f9ff9a1 --- /dev/null +++ b/libraries/evas/src/static_deps/liblinebreak/linebreak.c | |||
@@ -0,0 +1,737 @@ | |||
1 | /* vim: set tabstop=4 shiftwidth=4: */ | ||
2 | |||
3 | /* | ||
4 | * Line breaking in a Unicode sequence. Designed to be used in a | ||
5 | * generic text renderer. | ||
6 | * | ||
7 | * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com> | ||
8 | * | ||
9 | * This software is provided 'as-is', without any express or implied | ||
10 | * warranty. In no event will the author be held liable for any damages | ||
11 | * arising from the use of this software. | ||
12 | * | ||
13 | * Permission is granted to anyone to use this software for any purpose, | ||
14 | * including commercial applications, and to alter it and redistribute | ||
15 | * it freely, subject to the following restrictions: | ||
16 | * | ||
17 | * 1. The origin of this software must not be misrepresented; you must | ||
18 | * not claim that you wrote the original software. If you use this | ||
19 | * software in a product, an acknowledgement in the product | ||
20 | * documentation would be appreciated but is not required. | ||
21 | * 2. Altered source versions must be plainly marked as such, and must | ||
22 | * not be misrepresented as being the original software. | ||
23 | * 3. This notice may not be removed or altered from any source | ||
24 | * distribution. | ||
25 | * | ||
26 | * The main reference is Unicode Standard Annex 14 (UAX #14): | ||
27 | * <URL:http://www.unicode.org/reports/tr14/> | ||
28 | * | ||
29 | * When this library was designed, this annex was at Revision 19, for | ||
30 | * Unicode 5.0.0: | ||
31 | * <URL:http://www.unicode.org/reports/tr14/tr14-19.html> | ||
32 | * | ||
33 | * This library has been updated according to Revision 24, for | ||
34 | * Unicode 5.2.0: | ||
35 | * <URL:http://www.unicode.org/reports/tr14/tr14-24.html> | ||
36 | * | ||
37 | * The Unicode Terms of Use are available at | ||
38 | * <URL:http://www.unicode.org/copyright.html> | ||
39 | */ | ||
40 | |||
41 | /** | ||
42 | * @file linebreak.c | ||
43 | * | ||
44 | * Implementation of the line breaking algorithm as described in Unicode | ||
45 | * Standard Annex 14. | ||
46 | * | ||
47 | * @version 2.0, 2010/01/03 | ||
48 | * @author Wu Yongwei | ||
49 | */ | ||
50 | |||
51 | #include <assert.h> | ||
52 | #include <stddef.h> | ||
53 | #include <string.h> | ||
54 | #include "linebreak.h" | ||
55 | #include "linebreakdef.h" | ||
56 | |||
/**
 * Number of entries in the second-level index to the line breaking
 * properties.
 */
#define LINEBREAK_INDEX_SIZE 40
61 | |||
62 | /** | ||
63 | * Version number of the library. | ||
64 | */ | ||
65 | const int linebreak_version = LINEBREAK_VERSION; | ||
66 | |||
/**
 * Break actions.  These are the values stored in the break action pair
 * table; each one tells what may happen between two adjacent line
 * breaking classes.
 */
enum BreakAction
{
    DIR_BRK = 0,    /**< Direct break opportunity */
    IND_BRK = 1,    /**< Indirect break opportunity */
    CMI_BRK = 2,    /**< Indirect break opportunity for combining marks */
    CMP_BRK = 3,    /**< Prohibited break for combining marks */
    PRH_BRK = 4     /**< Prohibited break */
};
79 | |||
80 | /** | ||
81 | * Break action pair table. This is a direct mapping of Table 2 of | ||
82 | * Unicode Standard Annex 14, Revision 24. | ||
83 | */ | ||
84 | static enum BreakAction baTable[LBP_JT][LBP_JT] = { | ||
85 | { /* OP */ | ||
86 | PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, | ||
87 | PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, | ||
88 | PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK, | ||
89 | PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK }, | ||
90 | { /* CL */ | ||
91 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, | ||
92 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
93 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
94 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
95 | { /* CP */ | ||
96 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, | ||
97 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, | ||
98 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
99 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
100 | { /* QU */ | ||
101 | PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
102 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, | ||
103 | IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, | ||
104 | PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, | ||
105 | { /* GL */ | ||
106 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
107 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, | ||
108 | IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, | ||
109 | PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, | ||
110 | { /* NS */ | ||
111 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
112 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
113 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
114 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
115 | { /* EX */ | ||
116 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
117 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
118 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
119 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
120 | { /* SY */ | ||
121 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
122 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, | ||
123 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
124 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
125 | { /* IS */ | ||
126 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
127 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, | ||
128 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
129 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
130 | { /* PR */ | ||
131 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
132 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, | ||
133 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
134 | PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, | ||
135 | { /* PO */ | ||
136 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
137 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, | ||
138 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
139 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
140 | { /* NU */ | ||
141 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
142 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, | ||
143 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
144 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
145 | { /* AL */ | ||
146 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
147 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, | ||
148 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
149 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
150 | { /* ID */ | ||
151 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
152 | PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
153 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
154 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
155 | { /* IN */ | ||
156 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
157 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
158 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
159 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
160 | { /* HY */ | ||
161 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, | ||
162 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, | ||
163 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
164 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
165 | { /* BA */ | ||
166 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, | ||
167 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
168 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
169 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
170 | { /* BB */ | ||
171 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
172 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, | ||
173 | IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, | ||
174 | PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, | ||
175 | { /* B2 */ | ||
176 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
177 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
178 | DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK, | ||
179 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
180 | { /* ZW */ | ||
181 | DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
182 | DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
183 | DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK, | ||
184 | DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
185 | { /* CM */ | ||
186 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
187 | PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, | ||
188 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
189 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, | ||
190 | { /* WJ */ | ||
191 | IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
192 | PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, | ||
193 | IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, | ||
194 | PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, | ||
195 | { /* H2 */ | ||
196 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
197 | PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
198 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
199 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK }, | ||
200 | { /* H3 */ | ||
201 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
202 | PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
203 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
204 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }, | ||
205 | { /* JL */ | ||
206 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
207 | PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
208 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
209 | PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK }, | ||
210 | { /* JV */ | ||
211 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
212 | PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
213 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
214 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK }, | ||
215 | { /* JT */ | ||
216 | DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, | ||
217 | PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, | ||
218 | IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, | ||
219 | PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK } | ||
220 | }; | ||
221 | |||
222 | /** | ||
223 | * Struct for the second-level index to the line breaking properties. | ||
224 | */ | ||
225 | struct LineBreakPropertiesIndex | ||
226 | { | ||
227 | utf32_t end; /**< End coding point */ | ||
228 | struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */ | ||
229 | }; | ||
230 | |||
231 | /** | ||
232 | * Second-level index to the line breaking properties. | ||
233 | */ | ||
234 | static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] = | ||
235 | { | ||
236 | { 0xFFFFFFFF, lb_prop_default } | ||
237 | }; | ||
238 | |||
239 | /** | ||
240 | * Initializes the second-level index to the line breaking properties. | ||
241 | * If it is not called, the performance of #get_char_lb_class_lang (and | ||
242 | * thus the main functionality) can be pretty bad, especially for big | ||
243 | * code points like those of Chinese. | ||
244 | */ | ||
245 | void init_linebreak(void) | ||
246 | { | ||
247 | size_t i; | ||
248 | size_t iPropDefault; | ||
249 | size_t len; | ||
250 | size_t step; | ||
251 | |||
252 | len = 0; | ||
253 | while (lb_prop_default[len].prop != LBP_Undefined) | ||
254 | ++len; | ||
255 | step = len / LINEBREAK_INDEX_SIZE; | ||
256 | iPropDefault = 0; | ||
257 | for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i) | ||
258 | { | ||
259 | lb_prop_index[i].lbp = lb_prop_default + iPropDefault; | ||
260 | iPropDefault += step; | ||
261 | lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1; | ||
262 | } | ||
263 | lb_prop_index[--i].end = 0xFFFFFFFF; | ||
264 | } | ||
265 | |||
266 | /** | ||
267 | * Gets the language-specific line breaking properties. | ||
268 | * | ||
269 | * @param lang language of the text | ||
270 | * @return pointer to the language-specific line breaking | ||
271 | * properties array if found; \c NULL otherwise | ||
272 | */ | ||
273 | static struct LineBreakProperties *get_lb_prop_lang(const char *lang) | ||
274 | { | ||
275 | struct LineBreakPropertiesLang *lbplIter; | ||
276 | if (lang != NULL) | ||
277 | { | ||
278 | for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter) | ||
279 | { | ||
280 | if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0) | ||
281 | { | ||
282 | return lbplIter->lbp; | ||
283 | } | ||
284 | } | ||
285 | } | ||
286 | return NULL; | ||
287 | } | ||
288 | |||
289 | /** | ||
290 | * Gets the line breaking class of a character from a line breaking | ||
291 | * properties array. | ||
292 | * | ||
293 | * @param ch character to check | ||
294 | * @param lbp pointer to the line breaking properties array | ||
295 | * @return the line breaking class if found; \c LBP_XX otherwise | ||
296 | */ | ||
297 | static enum LineBreakClass get_char_lb_class( | ||
298 | utf32_t ch, | ||
299 | struct LineBreakProperties *lbp) | ||
300 | { | ||
301 | while (lbp->prop != LBP_Undefined && ch >= lbp->start) | ||
302 | { | ||
303 | if (ch <= lbp->end) | ||
304 | return lbp->prop; | ||
305 | ++lbp; | ||
306 | } | ||
307 | return LBP_XX; | ||
308 | } | ||
309 | |||
310 | /** | ||
311 | * Gets the line breaking class of a character from the default line | ||
312 | * breaking properties array. | ||
313 | * | ||
314 | * @param ch character to check | ||
315 | * @return the line breaking class if found; \c LBP_XX otherwise | ||
316 | */ | ||
317 | static enum LineBreakClass get_char_lb_class_default( | ||
318 | utf32_t ch) | ||
319 | { | ||
320 | size_t i = 0; | ||
321 | while (ch > lb_prop_index[i].end) | ||
322 | ++i; | ||
323 | assert(i < LINEBREAK_INDEX_SIZE); | ||
324 | return get_char_lb_class(ch, lb_prop_index[i].lbp); | ||
325 | } | ||
326 | |||
327 | /** | ||
328 | * Gets the line breaking class of a character for a specific | ||
329 | * language. This function will check the language-specific data first, | ||
330 | * and then the default data if there is no language-specific property | ||
331 | * available for the character. | ||
332 | * | ||
333 | * @param ch character to check | ||
334 | * @param lbpLang pointer to the language-specific line breaking | ||
335 | * properties array | ||
336 | * @return the line breaking class if found; \c LBP_XX | ||
337 | * otherwise | ||
338 | */ | ||
339 | static enum LineBreakClass get_char_lb_class_lang( | ||
340 | utf32_t ch, | ||
341 | struct LineBreakProperties *lbpLang) | ||
342 | { | ||
343 | enum LineBreakClass lbcResult; | ||
344 | |||
345 | /* Find the language-specific line breaking class for a character */ | ||
346 | if (lbpLang) | ||
347 | { | ||
348 | lbcResult = get_char_lb_class(ch, lbpLang); | ||
349 | if (lbcResult != LBP_XX) | ||
350 | return lbcResult; | ||
351 | } | ||
352 | |||
353 | /* Find the generic language-specific line breaking class, if no | ||
354 | * language context is provided, or language-specific data are not | ||
355 | * available for the specific character in the specified language */ | ||
356 | return get_char_lb_class_default(ch); | ||
357 | } | ||
358 | |||
359 | /** | ||
360 | * Resolves the line breaking class for certain ambiguous or complicated | ||
361 | * characters. They are treated in a simplistic way in this | ||
362 | * implementation. | ||
363 | * | ||
364 | * @param lbc line breaking class to resolve | ||
365 | * @param lang language of the text | ||
366 | * @return the resolved line breaking class | ||
367 | */ | ||
368 | static enum LineBreakClass resolve_lb_class( | ||
369 | enum LineBreakClass lbc, | ||
370 | const char *lang) | ||
371 | { | ||
372 | switch (lbc) | ||
373 | { | ||
374 | case LBP_AI: | ||
375 | if (lang != NULL && | ||
376 | (strncmp(lang, "zh", 2) == 0 || /* Chinese */ | ||
377 | strncmp(lang, "ja", 2) == 0 || /* Japanese */ | ||
378 | strncmp(lang, "ko", 2) == 0)) /* Korean */ | ||
379 | { | ||
380 | return LBP_ID; | ||
381 | } | ||
382 | /* Fall through */ | ||
383 | case LBP_SA: | ||
384 | case LBP_SG: | ||
385 | case LBP_XX: | ||
386 | return LBP_AL; | ||
387 | default: | ||
388 | return lbc; | ||
389 | } | ||
390 | } | ||
391 | |||
392 | /** | ||
393 | * Gets the next Unicode character in a UTF-8 sequence. The index will | ||
394 | * be advanced to the next complete character, unless the end of string | ||
395 | * is reached in the middle of a UTF-8 sequence. | ||
396 | * | ||
397 | * @param[in] s input UTF-8 string | ||
398 | * @param[in] len length of the string in bytes | ||
399 | * @param[in,out] ip pointer to the index | ||
400 | * @return the Unicode character beginning at the index; or | ||
401 | * #EOS if end of input is encountered | ||
402 | */ | ||
403 | utf32_t lb_get_next_char_utf8( | ||
404 | const utf8_t *s, | ||
405 | size_t len, | ||
406 | size_t *ip) | ||
407 | { | ||
408 | utf8_t ch; | ||
409 | utf32_t res; | ||
410 | |||
411 | assert(*ip <= len); | ||
412 | if (*ip == len) | ||
413 | return EOS; | ||
414 | ch = s[*ip]; | ||
415 | |||
416 | if (ch < 0xC2 || ch > 0xF4) | ||
417 | { /* One-byte sequence, tail (should not occur), or invalid */ | ||
418 | *ip += 1; | ||
419 | return ch; | ||
420 | } | ||
421 | else if (ch < 0xE0) | ||
422 | { /* Two-byte sequence */ | ||
423 | if (*ip + 2 > len) | ||
424 | return EOS; | ||
425 | res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); | ||
426 | *ip += 2; | ||
427 | return res; | ||
428 | } | ||
429 | else if (ch < 0xF0) | ||
430 | { /* Three-byte sequence */ | ||
431 | if (*ip + 3 > len) | ||
432 | return EOS; | ||
433 | res = ((ch & 0x0F) << 12) + | ||
434 | ((s[*ip + 1] & 0x3F) << 6) + | ||
435 | ((s[*ip + 2] & 0x3F)); | ||
436 | *ip += 3; | ||
437 | return res; | ||
438 | } | ||
439 | else | ||
440 | { /* Four-byte sequence */ | ||
441 | if (*ip + 4 > len) | ||
442 | return EOS; | ||
443 | res = ((ch & 0x07) << 18) + | ||
444 | ((s[*ip + 1] & 0x3F) << 12) + | ||
445 | ((s[*ip + 2] & 0x3F) << 6) + | ||
446 | ((s[*ip + 3] & 0x3F)); | ||
447 | *ip += 4; | ||
448 | return res; | ||
449 | } | ||
450 | } | ||
451 | |||
452 | /** | ||
453 | * Gets the next Unicode character in a UTF-16 sequence. The index will | ||
454 | * be advanced to the next complete character, unless the end of string | ||
455 | * is reached in the middle of a UTF-16 surrogate pair. | ||
456 | * | ||
457 | * @param[in] s input UTF-16 string | ||
458 | * @param[in] len length of the string in words | ||
459 | * @param[in,out] ip pointer to the index | ||
460 | * @return the Unicode character beginning at the index; or | ||
461 | * #EOS if end of input is encountered | ||
462 | */ | ||
463 | utf32_t lb_get_next_char_utf16( | ||
464 | const utf16_t *s, | ||
465 | size_t len, | ||
466 | size_t *ip) | ||
467 | { | ||
468 | utf16_t ch; | ||
469 | |||
470 | assert(*ip <= len); | ||
471 | if (*ip == len) | ||
472 | return EOS; | ||
473 | ch = s[(*ip)++]; | ||
474 | |||
475 | if (ch < 0xD800 || ch > 0xDBFF) | ||
476 | { /* If the character is not a high surrogate */ | ||
477 | return ch; | ||
478 | } | ||
479 | if (*ip == len) | ||
480 | { /* If the input ends here (an error) */ | ||
481 | --(*ip); | ||
482 | return EOS; | ||
483 | } | ||
484 | if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) | ||
485 | { /* If the next character is not the low surrogate (an error) */ | ||
486 | return ch; | ||
487 | } | ||
488 | /* Return the constructed character and advance the index again */ | ||
489 | return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; | ||
490 | } | ||
491 | |||
492 | /** | ||
493 | * Gets the next Unicode character in a UTF-32 sequence. The index will | ||
494 | * be advanced to the next character. | ||
495 | * | ||
496 | * @param[in] s input UTF-32 string | ||
497 | * @param[in] len length of the string in dwords | ||
498 | * @param[in,out] ip pointer to the index | ||
499 | * @return the Unicode character beginning at the index; or | ||
500 | * #EOS if end of input is encountered | ||
501 | */ | ||
502 | utf32_t lb_get_next_char_utf32( | ||
503 | const utf32_t *s, | ||
504 | size_t len, | ||
505 | size_t *ip) | ||
506 | { | ||
507 | assert(*ip <= len); | ||
508 | if (*ip == len) | ||
509 | return EOS; | ||
510 | return s[(*ip)++]; | ||
511 | } | ||
512 | |||
513 | /** | ||
514 | * Sets the line breaking information for a generic input string. | ||
515 | * | ||
516 | * @param[in] s input string | ||
517 | * @param[in] len length of the input | ||
518 | * @param[in] lang language of the input | ||
519 | * @param[out] brks pointer to the output breaking data, | ||
520 | * containing #LINEBREAK_MUSTBREAK, | ||
521 | * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK, | ||
522 | * or #LINEBREAK_INSIDEACHAR | ||
523 | * @param[in] get_next_char function to get the next UTF-32 character | ||
524 | */ | ||
525 | void set_linebreaks( | ||
526 | const void *s, | ||
527 | size_t len, | ||
528 | const char *lang, | ||
529 | char *brks, | ||
530 | get_next_char_t get_next_char) | ||
531 | { | ||
532 | utf32_t ch; | ||
533 | enum LineBreakClass lbcCur; | ||
534 | enum LineBreakClass lbcNew; | ||
535 | enum LineBreakClass lbcLast; | ||
536 | struct LineBreakProperties *lbpLang; | ||
537 | size_t posCur = 0; | ||
538 | size_t posLast = 0; | ||
539 | |||
540 | --posLast; /* To be ++'d later */ | ||
541 | ch = get_next_char(s, len, &posCur); | ||
542 | if (ch == EOS) | ||
543 | return; | ||
544 | lbpLang = get_lb_prop_lang(lang); | ||
545 | lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang); | ||
546 | lbcNew = LBP_Undefined; | ||
547 | |||
548 | nextline: | ||
549 | |||
550 | /* Special treatment for the first character */ | ||
551 | switch (lbcCur) | ||
552 | { | ||
553 | case LBP_LF: | ||
554 | case LBP_NL: | ||
555 | lbcCur = LBP_BK; | ||
556 | break; | ||
557 | case LBP_CB: | ||
558 | lbcCur = LBP_BA; | ||
559 | break; | ||
560 | case LBP_SP: | ||
561 | lbcCur = LBP_WJ; | ||
562 | break; | ||
563 | default: | ||
564 | break; | ||
565 | } | ||
566 | |||
567 | /* Process a line till an explicit break or end of string */ | ||
568 | for (;;) | ||
569 | { | ||
570 | for (++posLast; posLast < posCur - 1; ++posLast) | ||
571 | { | ||
572 | brks[posLast] = LINEBREAK_INSIDEACHAR; | ||
573 | } | ||
574 | assert(posLast == posCur - 1); | ||
575 | lbcLast = lbcNew; | ||
576 | ch = get_next_char(s, len, &posCur); | ||
577 | if (ch == EOS) | ||
578 | break; | ||
579 | lbcNew = get_char_lb_class_lang(ch, lbpLang); | ||
580 | if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF)) | ||
581 | { | ||
582 | brks[posLast] = LINEBREAK_MUSTBREAK; | ||
583 | lbcCur = resolve_lb_class(lbcNew, lang); | ||
584 | goto nextline; | ||
585 | } | ||
586 | |||
587 | switch (lbcNew) | ||
588 | { | ||
589 | case LBP_SP: | ||
590 | brks[posLast] = LINEBREAK_NOBREAK; | ||
591 | continue; | ||
592 | case LBP_BK: | ||
593 | case LBP_LF: | ||
594 | case LBP_NL: | ||
595 | brks[posLast] = LINEBREAK_NOBREAK; | ||
596 | lbcCur = LBP_BK; | ||
597 | continue; | ||
598 | case LBP_CR: | ||
599 | brks[posLast] = LINEBREAK_NOBREAK; | ||
600 | lbcCur = LBP_CR; | ||
601 | continue; | ||
602 | case LBP_CB: | ||
603 | brks[posLast] = LINEBREAK_ALLOWBREAK; | ||
604 | lbcCur = LBP_BA; | ||
605 | continue; | ||
606 | default: | ||
607 | break; | ||
608 | } | ||
609 | |||
610 | lbcNew = resolve_lb_class(lbcNew, lang); | ||
611 | |||
612 | assert(lbcCur <= LBP_JT); | ||
613 | assert(lbcNew <= LBP_JT); | ||
614 | switch (baTable[lbcCur - 1][lbcNew - 1]) | ||
615 | { | ||
616 | case DIR_BRK: | ||
617 | brks[posLast] = LINEBREAK_ALLOWBREAK; | ||
618 | break; | ||
619 | case CMI_BRK: | ||
620 | case IND_BRK: | ||
621 | if (lbcLast == LBP_SP) | ||
622 | { | ||
623 | brks[posLast] = LINEBREAK_ALLOWBREAK; | ||
624 | } | ||
625 | else | ||
626 | { | ||
627 | brks[posLast] = LINEBREAK_NOBREAK; | ||
628 | } | ||
629 | break; | ||
630 | case CMP_BRK: | ||
631 | brks[posLast] = LINEBREAK_NOBREAK; | ||
632 | if (lbcLast != LBP_SP) | ||
633 | continue; | ||
634 | break; | ||
635 | case PRH_BRK: | ||
636 | brks[posLast] = LINEBREAK_NOBREAK; | ||
637 | break; | ||
638 | } | ||
639 | |||
640 | lbcCur = lbcNew; | ||
641 | } | ||
642 | |||
643 | assert(posLast == posCur - 1 && posCur <= len); | ||
644 | /* Break after the last character */ | ||
645 | brks[posLast] = LINEBREAK_MUSTBREAK; | ||
646 | /* When the input contains incomplete sequences */ | ||
647 | while (posCur < len) | ||
648 | { | ||
649 | brks[posCur++] = LINEBREAK_INSIDEACHAR; | ||
650 | } | ||
651 | } | ||
652 | |||
653 | /** | ||
654 | * Sets the line breaking information for a UTF-8 input string. | ||
655 | * | ||
656 | * @param[in] s input UTF-8 string | ||
657 | * @param[in] len length of the input | ||
658 | * @param[in] lang language of the input | ||
659 | * @param[out] brks pointer to the output breaking data, containing | ||
660 | * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, | ||
661 | * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR | ||
662 | */ | ||
663 | void set_linebreaks_utf8( | ||
664 | const utf8_t *s, | ||
665 | size_t len, | ||
666 | const char *lang, | ||
667 | char *brks) | ||
668 | { | ||
669 | set_linebreaks(s, len, lang, brks, | ||
670 | (get_next_char_t)lb_get_next_char_utf8); | ||
671 | } | ||
672 | |||
673 | /** | ||
674 | * Sets the line breaking information for a UTF-16 input string. | ||
675 | * | ||
676 | * @param[in] s input UTF-16 string | ||
677 | * @param[in] len length of the input | ||
678 | * @param[in] lang language of the input | ||
679 | * @param[out] brks pointer to the output breaking data, containing | ||
680 | * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, | ||
681 | * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR | ||
682 | */ | ||
683 | void set_linebreaks_utf16( | ||
684 | const utf16_t *s, | ||
685 | size_t len, | ||
686 | const char *lang, | ||
687 | char *brks) | ||
688 | { | ||
689 | set_linebreaks(s, len, lang, brks, | ||
690 | (get_next_char_t)lb_get_next_char_utf16); | ||
691 | } | ||
692 | |||
693 | /** | ||
694 | * Sets the line breaking information for a UTF-32 input string. | ||
695 | * | ||
696 | * @param[in] s input UTF-32 string | ||
697 | * @param[in] len length of the input | ||
698 | * @param[in] lang language of the input | ||
699 | * @param[out] brks pointer to the output breaking data, containing | ||
700 | * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, | ||
701 | * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR | ||
702 | */ | ||
703 | void set_linebreaks_utf32( | ||
704 | const utf32_t *s, | ||
705 | size_t len, | ||
706 | const char *lang, | ||
707 | char *brks) | ||
708 | { | ||
709 | set_linebreaks(s, len, lang, brks, | ||
710 | (get_next_char_t)lb_get_next_char_utf32); | ||
711 | } | ||
712 | |||
713 | /** | ||
714 | * Tells whether a line break can occur between two Unicode characters. | ||
715 | * This is a wrapper function to expose a simple interface. Generally | ||
716 | * speaking, it is better to use #set_linebreaks_utf32 instead, since | ||
717 | * complicated cases involving combining marks, spaces, etc. cannot be | ||
718 | * correctly processed. | ||
719 | * | ||
720 | * @param char1 the first Unicode character | ||
721 | * @param char2 the second Unicode character | ||
722 | * @param lang language of the input | ||
723 | * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, | ||
724 | * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR | ||
725 | */ | ||
726 | int is_line_breakable( | ||
727 | utf32_t char1, | ||
728 | utf32_t char2, | ||
729 | const char* lang) | ||
730 | { | ||
731 | utf32_t s[2]; | ||
732 | char brks[2]; | ||
733 | s[0] = char1; | ||
734 | s[1] = char2; | ||
735 | set_linebreaks_utf32(s, 2, lang, brks); | ||
736 | return brks[0]; | ||
737 | } | ||