aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/eina/src/lib/eina_unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/eina/src/lib/eina_unicode.c')
-rw-r--r--libraries/eina/src/lib/eina_unicode.c405
1 files changed, 405 insertions, 0 deletions
diff --git a/libraries/eina/src/lib/eina_unicode.c b/libraries/eina/src/lib/eina_unicode.c
new file mode 100644
index 0000000..342e3cb
--- /dev/null
+++ b/libraries/eina/src/lib/eina_unicode.c
@@ -0,0 +1,405 @@
1/* EINA - EFL data type library
2 * Copyright (C) 2010 Tom Hacohen,
3 * Brett Nash
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library;
17 * if not, see <http://www.gnu.org/licenses/>.
18
19 */
20
21#include <Eina.h>
22#include "eina_unicode.h"
23
24/* FIXME: check if sizeof(wchar_t) == sizeof(Eina_Unicode) if so,
25 * probably better to use the standard functions */
26
27/* Maybe I'm too tired, but this is the only thing that actually worked. */
28const Eina_Unicode _EINA_UNICODE_EMPTY_STRING[1] = {0};
29EAPI const Eina_Unicode *EINA_UNICODE_EMPTY_STRING = _EINA_UNICODE_EMPTY_STRING;
30EAPI int
31eina_unicode_strcmp(const Eina_Unicode *a, const Eina_Unicode *b)
32{
33 for (; *a && *a == *b; a++, b++)
34 ;
35 if (*a == *b)
36 return 0;
37 else if (*a < *b)
38 return -1;
39 else
40 return 1;
41}
42
43EAPI Eina_Unicode *
44eina_unicode_strcpy(Eina_Unicode *dest, const Eina_Unicode *source)
45{
46 Eina_Unicode *ret = dest;
47
48 while (*source)
49 *dest++ = *source++;
50 *dest = 0;
51 return ret;
52}
53
54EAPI Eina_Unicode *
55eina_unicode_strncpy(Eina_Unicode *dest, const Eina_Unicode *source, size_t n)
56{
57 Eina_Unicode *ret = dest;
58
59 for ( ; n && *source ; n--)
60 *dest++ = *source++;
61 for (; n; n--)
62 *dest++ = 0;
63 return ret;
64}
65
66EAPI size_t
67eina_unicode_strlen(const Eina_Unicode *ustr)
68{
69 const Eina_Unicode *end;
70 for (end = ustr; *end; end++)
71 ;
72 return end - ustr;
73}
74
75EAPI size_t
76eina_unicode_strnlen(const Eina_Unicode *ustr, int n)
77{
78 const Eina_Unicode *end;
79 const Eina_Unicode *last = ustr + n; /* technically not portable ;-) */
80 for (end = ustr; end < last && *end; end++)
81 ;
82 return end - ustr;
83}
84
85
86
87
88EAPI Eina_Unicode *
89eina_unicode_strndup(const Eina_Unicode *text, size_t n)
90{
91 Eina_Unicode *ustr;
92
93 ustr = (Eina_Unicode *) malloc((n + 1) * sizeof(Eina_Unicode));
94 memcpy(ustr, text, n * sizeof(Eina_Unicode));
95 ustr[n] = 0;
96 return ustr;
97}
98
99EAPI Eina_Unicode *
100eina_unicode_strdup(const Eina_Unicode *text)
101{
102 size_t len;
103
104 len = eina_unicode_strlen(text);
105 return eina_unicode_strndup(text, len);
106}
107
108EAPI Eina_Unicode *
109eina_unicode_strstr(const Eina_Unicode *haystack, const Eina_Unicode *needle)
110{
111 const Eina_Unicode *i, *j;
112
113 for (i = haystack; *i; i++)
114 {
115 haystack = i; /* set this location as the base position */
116 for (j = needle; *j && *i && *j == *i; j++, i++)
117 ;
118
119 if (!*j) /*if we got to the end of j this means we got a full match */
120 {
121 return (Eina_Unicode *)haystack; /* return the new base position */
122 }
123 }
124
125 return NULL;
126}
127
128EAPI Eina_Unicode *
129eina_unicode_escape(const Eina_Unicode *str)
130{
131 Eina_Unicode *s2, *d;
132 const Eina_Unicode *s;
133
134 s2 = malloc((eina_unicode_strlen(str) * 2) + 1);
135 if (!s2)
136 return NULL;
137
138 for (s = str, d = s2; *s != 0; s++, d++)
139 {
140 if ((*s == ' ') || (*s == '\\') || (*s == '\''))
141 {
142 *d = '\\';
143 d++;
144 }
145
146 *d = *s;
147 }
148 *d = 0;
149 return s2;
150}
151
152/* UTF-8 Handling */
153
/* Number of bytes reserved per code point when encoding to UTF-8. */
#define EINA_UNICODE_UTF8_BYTES_PER_CHAR 6
/* The replacement range that will be used for bad utf8 chars. */
#define ERROR_REPLACEMENT_BASE  0xDC80
#define ERROR_REPLACEMENT_END   0xDCFF
/* True for the byte values 0xC0 (192), 0xC1 (193) and 0xF5 (245) or above.
 * The argument is fully parenthesized so that compound expressions such as
 * (a | b) are tested as a whole; note it is still evaluated several times,
 * so it must not have side effects. */
#define IS_INVALID_BYTE(x)      (((x) == 192) || ((x) == 193) || ((x) >= 245))
/* True when the two top bits of the byte are 10 (pattern 10xxxxxx). */
#define IS_CONTINUATION_BYTE(x) (((x) & 0xC0) == 0x80)
160
161EAPI Eina_Unicode
162eina_unicode_utf8_get_next(const char *buf, int *iindex)
163{
164 int ind = *iindex;
165 Eina_Unicode r;
166 unsigned char d;
167
168 /* if this char is the null terminator, exit */
169 if ((d = buf[ind++]) == 0) return 0;
170
171 if ((d & 0x80) == 0)
172 { // 1 byte (7bit) - 0xxxxxxx
173 *iindex = ind;
174 return d;
175 }
176 if ((d & 0xe0) == 0xc0)
177 { // 2 byte (11bit) - 110xxxxx 10xxxxxx
178 r = (d & 0x1f) << 6;
179 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
180 !IS_CONTINUATION_BYTE(d)) goto error;
181 r |= (d & 0x3f);
182 if (r <= 0x7F) goto error;
183 *iindex = ind;
184 return r;
185 }
186 if ((d & 0xf0) == 0xe0)
187 { // 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx
188 r = (d & 0x0f) << 12;
189 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
190 !IS_CONTINUATION_BYTE(d)) goto error;
191 r |= (d & 0x3f) << 6;
192 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
193 !IS_CONTINUATION_BYTE(d)) goto error;
194 r |= (d & 0x3f);
195 if (r <= 0x7FF) goto error;
196 *iindex = ind;
197 return r;
198 }
199 if ((d & 0xf8) == 0xf0)
200 { // 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
201 r = (d & 0x07) << 18;
202 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
203 !IS_CONTINUATION_BYTE(d)) goto error;
204 r |= (d & 0x3f) << 12;
205 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
206 !IS_CONTINUATION_BYTE(d)) goto error;
207 r |= (d & 0x3f) << 6;
208 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
209 !IS_CONTINUATION_BYTE(d)) goto error;
210 r |= (d & 0x3f);
211 if (r <= 0xFFFF) goto error;
212 *iindex = ind;
213 return r;
214 }
215 if ((d & 0xfc) == 0xf8)
216 { // 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
217 r = (d & 0x03) << 24;
218 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
219 !IS_CONTINUATION_BYTE(d)) goto error;
220 r |= (d & 0x3f) << 18;
221 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
222 !IS_CONTINUATION_BYTE(d)) goto error;
223 r |= (d & 0x3f) << 12;
224 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
225 !IS_CONTINUATION_BYTE(d)) goto error;
226 r |= (d & 0x3f) << 6;
227 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
228 !IS_CONTINUATION_BYTE(d)) goto error;
229 r |= (d & 0x3f);
230 if (r <= 0x1FFFFF) goto error;
231 *iindex = ind;
232 return r;
233 }
234 if ((d & 0xfe) == 0xfc)
235 { // 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
236 r = (d & 0x01) << 30;
237 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
238 !IS_CONTINUATION_BYTE(d)) goto error;
239 r |= (d & 0x3f) << 24;
240 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
241 !IS_CONTINUATION_BYTE(d)) goto error;
242 r |= (d & 0x3f) << 18;
243 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
244 !IS_CONTINUATION_BYTE(d)) goto error;
245 r |= (d & 0x3f) << 12;
246 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
247 !IS_CONTINUATION_BYTE(d)) goto error;
248 r |= (d & 0x3f) << 6;
249 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
250 !IS_CONTINUATION_BYTE(d)) goto error;
251 r |= (d & 0x3f);
252 if (r <= 0x3FFFFFF) goto error;
253 *iindex = ind;
254 return r;
255 }
256
257/* Gets here where there was an error and we want to replace the char
258 * we just use the invalid unicode codepoints 8 lower bits represent
259 * the original char */
260error:
261 d = buf[*iindex];
262 (*iindex)++;
263 return ERROR_REPLACEMENT_BASE | d;
264}
265
266EAPI Eina_Unicode
267eina_unicode_utf8_get_prev(const char *buf, int *iindex)
268{
269 int r;
270 int ind = *iindex;
271 /* First obtain the codepoint at iindex */
272 r = eina_unicode_utf8_get_next(buf, &ind);
273
274 /* although when ind == 0 there's no previous char, we still want to get
275 * the current char */
276 if (*iindex <= 0)
277 return r;
278
279 /* Next advance iindex to previous codepoint */
280 ind = *iindex;
281 ind--;
282 while ((ind > 0) && ((buf[ind] & 0xc0) == 0x80))
283 ind--;
284
285 *iindex = ind;
286 return r;
287}
288
289EAPI int
290eina_unicode_utf8_get_len(const char *buf)
291{
292 /* returns the number of utf8 characters (not bytes) in the string */
293 int i = 0, len = 0;
294
295 while (eina_unicode_utf8_get_next(buf, &i))
296 len++;
297
298 return len;
299}
300
301EAPI Eina_Unicode *
302eina_unicode_utf8_to_unicode(const char *utf, int *_len)
303{
304 /* FIXME: Should optimize! */
305 int len, i;
306 int ind;
307 Eina_Unicode *buf, *uind;
308
309 len = eina_unicode_utf8_get_len(utf);
310 if (_len)
311 *_len = len;
312 buf = (Eina_Unicode *) calloc(sizeof(Eina_Unicode), (len + 1));
313 if (!buf) return buf;
314
315 for (i = 0, ind = 0, uind = buf ; i < len ; i++, uind++)
316 {
317 *uind = eina_unicode_utf8_get_next(utf, &ind);
318 }
319
320 return buf;
321}
322
323EAPI char *
324eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len)
325{
326 char *buf;
327 const Eina_Unicode *uind;
328 char *ind;
329 int ulen, len;
330
331 ulen = eina_unicode_strlen(uni);
332 buf = (char *) calloc(ulen + 1, EINA_UNICODE_UTF8_BYTES_PER_CHAR);
333
334 len = 0;
335 for (uind = uni, ind = buf ; *uind ; uind++)
336 {
337 if (*uind <= 0x7F) /* 1 byte char */
338 {
339 *ind++ = *uind;
340 len += 1;
341 }
342 else if (*uind <= 0x7FF) /* 2 byte char */
343 {
344 *ind++ = 0xC0 | (unsigned char) (*uind >> 6);
345 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
346 len += 2;
347 }
348 else if (*uind <= 0xFFFF) /* 3 byte char */
349 {
350 /* If it's a special replacement codepoint */
351 if (*uind >= ERROR_REPLACEMENT_BASE &&
352 *uind <= ERROR_REPLACEMENT_END)
353 {
354 *ind++ = *uind & 0xFF;
355 len += 1;
356 }
357 else
358 {
359 *ind++ = 0xE0 | (unsigned char) (*uind >> 12);
360 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
361 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
362 len += 3;
363 }
364 }
365 else if (*uind <= 0x1FFFFF) /* 4 byte char */
366 {
367 *ind++ = 0xF0 | (unsigned char) ((*uind >> 18) & 0x07);
368 *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
369 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
370 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
371 len += 4;
372 }
373 else if (*uind <= 0x3FFFFFF) /* 5 byte char */
374 {
375 *ind++ = 0xF8 | (unsigned char) ((*uind >> 24) & 0x03);
376 *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
377 *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
378 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
379 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
380 len += 5;
381 }
382 else if (*uind <= 0x7FFFFFFF) /* 6 byte char */
383 {
384 *ind++ = 0xFC | (unsigned char) ((*uind >> 30) & 0x01);
385 *ind++ = 0x80 | (unsigned char) ((*uind >> 24) & 0x3F);
386 *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
387 *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
388 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
389 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
390 len += 6;
391 }
392 else /* error */
393 {
394 /* Do something */
395 }
396 }
397 buf = realloc(buf, len + 1);
398 buf[len] = '\0';
399 if (_len)
400 *_len = len;
401 return buf;
402}
403
404
405