/* vim: set tabstop=4 shiftwidth=4: */

/*
 * Word breaking in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2011-2011 Tom Hacohen <tom@stosb.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *		<URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 17, for
 * Unicode 6.0.0:
 *		<URL:http://www.unicode.org/reports/tr29/tr29-17.html>
 *
 * The Unicode Terms of Use are available at
 *		<URL:http://www.unicode.org/copyright.html>
 */

/**
 * @file	wordbreak.c
 *
 * Implementation of the word breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @version	2.0, 2011/12/12
 * @author	Tom Hacohen
 */


#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"

#include "wordbreak.h"
#include "wordbreakdata.x"

/* Number of elements in an array object.  Only meaningful for true arrays:
 * a pointer argument would silently yield sizeof(pointer) / sizeof(element).
 * The parameter is parenthesized so that any array-typed expression can be
 * passed safely. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))

/**
 * Initialises the word breaking internals.
 *
 * There is nothing to set up at the moment, so the function is a no-op.
 * It is kept as a hook in case initialisation becomes necessary later.
 */
void init_wordbreak(void)
{
	/* Intentionally empty. */
}

/**
 * Gets the word breaking class of a character.
 *
 * Performs a binary search over \a wbp, comparing \a ch against the
 * inclusive [start, end] range of each entry.  The search therefore relies
 * on the entries being sorted by range and not overlapping.
 *
 * @param ch	character to check
 * @param wbp	pointer to the word breaking properties array
 * @param len	the size of the wbp array in number of items; may be 0, in
 *				which case the array is never read
 * @return		the word breaking class if found; \c WBP_Any otherwise
 */
static enum WordBreakClass get_char_wb_class(
		utf32_t ch,
		struct WordBreakProperties *wbp,
		size_t len)
{
	/* Half-open search window [lo, hi).  Using size_t avoids narrowing
	 * len to int, and testing the window before the first probe avoids
	 * reading wbp[0] when the table is empty. */
	size_t lo = 0;
	size_t hi = len;

	while (lo < hi)
	{
		/* Same probe as (min + max) / 2 on the inclusive window
		 * [lo, hi - 1], written so that it cannot overflow. */
		size_t mid = lo + (hi - lo - 1) / 2;

		if (ch < wbp[mid].start)
			hi = mid;
		else if (ch > wbp[mid].end)
			lo = mid + 1;
		else
			return wbp[mid].prop;
	}

	return WBP_Any;
}

/**
 * Fills in the break types in brks for the characters located between
 * posStart and posEnd.
 *
 * The range is walked one character at a time with get_next_char.  Every
 * position of a character except its last one is set to
 * #WORDBREAK_INSIDECHAR.  The last position is set to brkType, unless it
 * already holds #WORDBREAK_NOBREAK, in which case it is left alone: such
 * cells were marked earlier as places we really do not want to break after.
 *
 * @param[in]     s				the string
 * @param[in,out] brks			the breaks array to fill; assumed to be
 *								initialized by the caller
 * @param[in]     posStart		the start position
 * @param[in]     posEnd		the end position (not included)
 * @param[in]     len			the length of the string
 * @param[in]     brkType		the break type to use
 * @param[in]     get_next_char	function to get the next UTF-32 character
 */
static void set_brks_to(const void *s,
		char *brks,
		size_t posStart,
		size_t posEnd,
		size_t len,
		char brkType,
		get_next_char_t get_next_char)
{
	size_t posNext = posStart;

	while (posNext < posEnd)
	{
		size_t posTail;

		/* Only the advanced position matters here; the decoded character
		 * itself is not needed. */
		get_next_char(s, len, &posNext);
		posTail = posNext - 1;

		/* All positions of the character but the last one are "inside". */
		while (posStart < posTail)
		{
			brks[posStart++] = WORDBREAK_INSIDECHAR;
		}
		assert(posStart == posTail);

		/* Never override an earlier decision not to break here. */
		if (brks[posStart] != WORDBREAK_NOBREAK)
		{
			brks[posStart] = brkType;
		}
		posStart = posNext;
	}
}

/* Checks whether cls is Newline, CR or LF (the classes rules WB3a and WB3b
 * are about).  The argument is parenthesized so that any expression can be
 * passed; note that it may be evaluated up to three times, so it must not
 * have side effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
		((cls) == WBP_LF))

/**
 * Sets the word breaking information for a generic input string.
 *
 * The string is decoded one character at a time with \a get_next_char until
 * it returns \c EOS.  Decisions are made lazily: the break types of a run of
 * characters are only written (through set_brks_to) once a character is
 * reached that settles whether the run may be broken.
 *
 * @param[in]  s			input string
 * @param[in]  len			length of the input
 * @param[in]  lang			language of the input (currently unused)
 * @param[out] brks			pointer to the output breaking data, containing
 *							#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *							#WORDBREAK_INSIDECHAR; \a len entries are
 *							written
 * @param[in] get_next_char	function to get the next UTF-32 character
 */
static void set_wordbreaks(
		const void *s,
		size_t len,
		const char *lang,
		char *brks,
		get_next_char_t get_next_char)
{
	/* Previous class (after WB4 inheritance for Extend/Format). */
	enum WordBreakClass p_cls = WBP_Undefined;
	/* Strong previous class: the class of the character at which the
	 * last decision was taken, i.e. the one found at posLast. */
	enum WordBreakClass sp_cls = WBP_Undefined;
	utf32_t ch;
	/* Position just after the character currently being examined. */
	size_t posCur = 0;
	/* Position at which the character currently being examined starts. */
	size_t posCurSt = 0;
	/* Start of the range whose break types are not settled yet. */
	size_t posLast = 0;

	/* FIXME: unused atm. */
	(void) lang;


	/* Init brks */
	memset(brks, WORDBREAK_BREAK, len);

	ch = get_next_char(s, len, &posCur);

	/* WB3a, WB3b are implied. */
	for ( ; ch != EOS ; )
	{
		/* Current class */
		enum WordBreakClass c_cls;
		c_cls = get_char_wb_class(ch, wb_prop_default,
				ARRAY_LEN(wb_prop_default));

		switch (c_cls)
		{
		case WBP_CR:
			set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_LF:
			/* WB3: never break between CR and LF.  Otherwise WB3a applies
			 * and a break is allowed before the LF.  The pending range
			 * has to be flushed in both cases: skipping the call for a
			 * lone LF would leave the code units of multi-unit characters
			 * in that range without their #WORDBREAK_INSIDECHAR marks. */
			set_brks_to(s, brks, posLast, posCurSt, len,
					(sp_cls == WBP_CR) ? WORDBREAK_NOBREAK : WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_Newline:
			/* WB3a, WB3b */
			set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_Extend:
		case WBP_Format:
			/* WB4 - If not the first char/after a newline (W3ab),
			 * skip this class, set it to be the same as the prev, and mark
			 * brks not to break before them. */
			if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls))
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				/* Keep posLast in step with sp_cls, as every other
				 * branch does; the flushed range need not be walked
				 * again. */
				posLast = posCurSt;
			}
			else
			{
				/* It's surely not the first */
				brks[posCurSt - 1] = WORDBREAK_NOBREAK;
				/* "inherit" the previous class. */
				c_cls = p_cls;
			}
			break;

		case WBP_Katakana:
			if ((sp_cls == WBP_Katakana) || /* WB13 */
					(sp_cls == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_ALetter:
			if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */
					((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */
					(sp_cls == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_MidNumLet:
			if ((p_cls == WBP_ALetter) || /* WBP6,7 */
					(p_cls == WBP_Numeric)) /* WBP11,12 */
			{
				/* Go on: the decision depends on the next character, so
				 * leave the pending range and sp_cls untouched. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				posLast = posCurSt;
			}
			break;

		case WBP_MidLetter:
			if (p_cls == WBP_ALetter) /* WBP6,7 */
			{
				/* Go on */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				posLast = posCurSt;
			}
			break;

		case WBP_MidNum:
			if (p_cls == WBP_Numeric) /* WBP11,12 */
			{
				/* Go on */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				posLast = posCurSt;
			}
			break;

		case WBP_Numeric:
			if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */
					((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */
					(sp_cls == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_ExtendNumLet:
			/* WB13a,13b */
			if ((sp_cls == p_cls) &&
				((p_cls == WBP_ALetter) ||
				 (p_cls == WBP_Numeric) ||
				 (p_cls == WBP_Katakana) ||
				 (p_cls == WBP_ExtendNumLet)))
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_Any:
			/* Allow breaks and reset */
			set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		default:
			/* Error, should never get here! */
			assert(0);
			break;
		}

		p_cls = c_cls;
		posCurSt = posCur;
		ch = get_next_char(s, len, &posCur);
	}

	/* WB2 */
	set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK,
			get_next_char);
}

/**
 * Sets the word breaking information for a UTF-8 input string.
 *
 * @param[in]  s	input UTF-8 string
 * @param[in]  len	length of the input
 * @param[in]  lang	language of the input
 * @param[out] brks	pointer to the output breaking data, containing
 *					#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *					#WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf8(
		const utf8_t *s,
		size_t len,
		const char *lang,
		char *brks)
{
	set_wordbreaks(s, len, lang, brks,
				   (get_next_char_t)lb_get_next_char_utf8);
}

/**
 * Sets the word breaking information for a UTF-16 input string.
 *
 * @param[in]  s	input UTF-16 string
 * @param[in]  len	length of the input
 * @param[in]  lang	language of the input
 * @param[out] brks	pointer to the output breaking data, containing
 *					#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *					#WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf16(
		const utf16_t *s,
		size_t len,
		const char *lang,
		char *brks)
{
	set_wordbreaks(s, len, lang, brks,
				   (get_next_char_t)lb_get_next_char_utf16);
}

/**
 * Sets the word breaking information for a UTF-32 input string.
 *
 * @param[in]  s	input UTF-32 string
 * @param[in]  len	length of the input
 * @param[in]  lang	language of the input
 * @param[out] brks	pointer to the output breaking data, containing
 *					#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *					#WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf32(
		const utf32_t *s,
		size_t len,
		const char *lang,
		char *brks)
{
	set_wordbreaks(s, len, lang, brks,
				   (get_next_char_t)lb_get_next_char_utf32);
}