1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,      |
4    | that is bundled with this package in the file LICENSE, and is        |
5    | available through the world-wide-web at the following url:           |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to          |
9    | license@php.net so we can mail you a copy immediately.               |
10    +----------------------------------------------------------------------+
11    | Authors: Gustavo Lopes <cataphract@php.net>                          |
12    +----------------------------------------------------------------------+
13  */
14 
15 #include "codepointiterator_internal.h"
16 #include <unicode/uchriter.h>
17 #include <typeinfo>
18 
19 #include "php.h"
20 
21 //copied from cmemory.h, which is not public
22 typedef union {
23     zend_long    t1;
24     double  t2;
25     void   *t3;
26 } UAlignedMemory;
27 
28 #define U_POINTER_MASK_LSB(ptr, mask) (((ptrdiff_t)(char *)(ptr)) & (mask))
29 #define U_ALIGNMENT_OFFSET(ptr) U_POINTER_MASK_LSB(ptr, sizeof(UAlignedMemory) - 1)
30 #define U_ALIGNMENT_OFFSET_UP(ptr) (sizeof(UAlignedMemory) - U_ALIGNMENT_OFFSET(ptr))
31 
32 using namespace PHP;
33 
34 using icu::UCharCharacterIterator;
35 
// Expands ICU's UObject RTTI boilerplate for CodePointBreakIterator
// (per ICU's uobject.h: getStaticClassID()/getDynamicClassID()).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CodePointBreakIterator)
37 
38 CodePointBreakIterator::CodePointBreakIterator()
39 : BreakIterator(), lastCodePoint(U_SENTINEL), fCharIter(NULL)
40 {
41 	UErrorCode uec = UErrorCode();
42 	this->fText = utext_openUChars(NULL, NULL, 0, &uec);
43 }
44 
CodePointBreakIterator(const PHP::CodePointBreakIterator & other)45 CodePointBreakIterator::CodePointBreakIterator(const PHP::CodePointBreakIterator &other)
46 : BreakIterator(other), fText(NULL), lastCodePoint(U_SENTINEL), fCharIter(NULL)
47 {
48 	*this = other;
49 }
50 
// Copy assignment. Self-assignment is a no-op. Otherwise fText becomes a
// shallow, read-only utext_clone() of that.fText (the ICU status of the
// clone is not inspected), the current character iterator is cleared, and
// the remembered code point is copied over.
CodePointBreakIterator& CodePointBreakIterator::operator=(const CodePointBreakIterator& that)
{
	if (&that != this) {
		UErrorCode cloneStatus = UErrorCode();

		fText = utext_clone(fText, that.fText, false, true, &cloneStatus);

		// The character iterator is deliberately not copied: getText() is
		// deprecated.
		clearCurrentCharIter();

		lastCodePoint = that.lastCodePoint;
	}

	return *this;
}
67 
~CodePointBreakIterator()68 CodePointBreakIterator::~CodePointBreakIterator()
69 {
70 	if (this->fText) {
71 		utext_close(this->fText);
72 	}
73 	clearCurrentCharIter();
74 }
75 
76 #if U_ICU_VERSION_MAJOR_NUM >= 70
operator ==(const BreakIterator & that) const77 bool CodePointBreakIterator::operator==(const BreakIterator& that) const
78 #else
79 UBool CodePointBreakIterator::operator==(const BreakIterator& that) const
80 #endif
81 {
82 	if (typeid(*this) != typeid(that)) {
83 		return false;
84 	}
85 
86 	const CodePointBreakIterator& that2 =
87 		static_cast<const CodePointBreakIterator&>(that);
88 
89 	if (!utext_equals(this->fText, that2.fText)) {
90 		return false;
91 	}
92 
93 	return true;
94 }
95 
clone(void) const96 CodePointBreakIterator* CodePointBreakIterator::clone(void) const
97 {
98 	return new CodePointBreakIterator(*this);
99 }
100 
getText(void) const101 CharacterIterator& CodePointBreakIterator::getText(void) const
102 {
103 	if (this->fCharIter == NULL) {
104 		//this method is deprecated anyway; setup bogus iterator
105 		static const UChar c = 0;
106 		this->fCharIter = new UCharCharacterIterator(&c, 0);
107 	}
108 
109 	return *this->fCharIter;
110 }
111 
getUText(UText * fillIn,UErrorCode & status) const112 UText *CodePointBreakIterator::getUText(UText *fillIn, UErrorCode &status) const
113 {
114 	return utext_clone(fillIn, this->fText, false, true, &status);
115 }
116 
setText(const UnicodeString & text)117 void CodePointBreakIterator::setText(const UnicodeString &text)
118 {
119 	UErrorCode uec = UErrorCode();
120 
121 	//this closes the previous utext, if any
122 	this->fText = utext_openConstUnicodeString(this->fText, &text, &uec);
123 
124 	clearCurrentCharIter();
125 }
126 
setText(UText * text,UErrorCode & status)127 void CodePointBreakIterator::setText(UText *text, UErrorCode &status)
128 {
129 	if (U_FAILURE(status)) {
130 		return;
131 	}
132 
133 	this->fText = utext_clone(this->fText, text, false, true, &status);
134 
135 	clearCurrentCharIter();
136 }
137 
adoptText(CharacterIterator * it)138 void CodePointBreakIterator::adoptText(CharacterIterator* it)
139 {
140 	UErrorCode uec = UErrorCode();
141 	clearCurrentCharIter();
142 
143 	this->fCharIter = it;
144 	this->fText = utext_openCharacterIterator(this->fText, it, &uec);
145 }
146 
first(void)147 int32_t CodePointBreakIterator::first(void)
148 {
149 	UTEXT_SETNATIVEINDEX(this->fText, 0);
150 	this->lastCodePoint = U_SENTINEL;
151 
152 	return 0;
153 }
154 
last(void)155 int32_t CodePointBreakIterator::last(void)
156 {
157 	int32_t pos = (int32_t)utext_nativeLength(this->fText);
158 	UTEXT_SETNATIVEINDEX(this->fText, pos);
159 	this->lastCodePoint = U_SENTINEL;
160 
161 	return pos;
162 }
163 
previous(void)164 int32_t CodePointBreakIterator::previous(void)
165 {
166 	this->lastCodePoint = UTEXT_PREVIOUS32(this->fText);
167 	if (this->lastCodePoint == U_SENTINEL) {
168 		return BreakIterator::DONE;
169 	}
170 
171 	return (int32_t)UTEXT_GETNATIVEINDEX(this->fText);
172 }
173 
next(void)174 int32_t CodePointBreakIterator::next(void)
175 {
176 	this->lastCodePoint = UTEXT_NEXT32(this->fText);
177 	if (this->lastCodePoint == U_SENTINEL) {
178 		return BreakIterator::DONE;
179 	}
180 
181 	return (int32_t)UTEXT_GETNATIVEINDEX(this->fText);
182 }
183 
current(void) const184 int32_t CodePointBreakIterator::current(void) const
185 {
186 	return (int32_t)UTEXT_GETNATIVEINDEX(this->fText);
187 }
188 
following(int32_t offset)189 int32_t CodePointBreakIterator::following(int32_t offset)
190 {
191 	this->lastCodePoint = utext_next32From(this->fText, offset);
192 	if (this->lastCodePoint == U_SENTINEL) {
193 		return BreakIterator::DONE;
194 	}
195 
196 	return (int32_t)UTEXT_GETNATIVEINDEX(this->fText);
197 }
198 
preceding(int32_t offset)199 int32_t CodePointBreakIterator::preceding(int32_t offset)
200 {
201 	this->lastCodePoint = utext_previous32From(this->fText, offset);
202 	if (this->lastCodePoint == U_SENTINEL) {
203 		return BreakIterator::DONE;
204 	}
205 
206 	return (int32_t)UTEXT_GETNATIVEINDEX(this->fText);
207 }
208 
isBoundary(int32_t offset)209 UBool CodePointBreakIterator::isBoundary(int32_t offset)
210 {
211 	//this function has side effects, and it's supposed to
212 	utext_setNativeIndex(this->fText, offset);
213 	return (offset == utext_getNativeIndex(this->fText));
214 }
215 
next(int32_t n)216 int32_t CodePointBreakIterator::next(int32_t n)
217 {
218 	UBool res = utext_moveIndex32(this->fText, n);
219 
220 #ifndef UTEXT_CURRENT32
221 #define UTEXT_CURRENT32 utext_current32
222 #endif
223 
224 	if (res) {
225 		this->lastCodePoint = UTEXT_CURRENT32(this->fText);
226 		return (int32_t)UTEXT_GETNATIVEINDEX(this->fText);
227 	} else {
228 		this->lastCodePoint = U_SENTINEL;
229 		return BreakIterator::DONE;
230 	}
231 }
232 
createBufferClone(void * stackBuffer,int32_t & bufferSize,UErrorCode & status)233 CodePointBreakIterator *CodePointBreakIterator::createBufferClone(
234 	void *stackBuffer, int32_t &bufferSize, UErrorCode &status)
235 {
236 	//see implementation of RuleBasedBreakIterator::createBufferClone()
237 	if (U_FAILURE(status)) {
238 		return NULL;
239 	}
240 
241 	if (bufferSize <= 0) {
242 		bufferSize = sizeof(CodePointBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
243 		return NULL;
244 	}
245 
246 	char *buf = (char*)stackBuffer;
247 	uint32_t s = bufferSize;
248 
249 	if (stackBuffer == NULL) {
250 		 s = 0;
251 	}
252 
253 	if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
254 		uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
255 		s -= offsetUp;
256 		buf += offsetUp;
257 	}
258 
259 	if (s < sizeof(CodePointBreakIterator)) {
260 		CodePointBreakIterator *clonedBI = new CodePointBreakIterator(*this);
261 		if (clonedBI == NULL) {
262 			status = U_MEMORY_ALLOCATION_ERROR;
263 		} else {
264 			status = U_SAFECLONE_ALLOCATED_WARNING;
265 		}
266 
267 		return clonedBI;
268 	}
269 
270 	return new(buf) CodePointBreakIterator(*this);
271 }
272 
refreshInputText(UText * input,UErrorCode & status)273 CodePointBreakIterator &CodePointBreakIterator::refreshInputText(UText *input, UErrorCode &status)
274 {
275 	//see implementation of RuleBasedBreakIterator::createBufferClone()
276 	if (U_FAILURE(status)) {
277 		return *this;
278 	}
279 	if (input == NULL) {
280 		status = U_ILLEGAL_ARGUMENT_ERROR;
281 		return *this;
282 	}
283 
284 	int64_t pos = utext_getNativeIndex(this->fText);
285 	this->fText = utext_clone(this->fText, input, false, true, &status);
286 	if (U_FAILURE(status)) {
287 		return *this;
288 	}
289 
290 	utext_setNativeIndex(this->fText, pos);
291 	if (utext_getNativeIndex(fText) != pos) {
292 		status = U_ILLEGAL_ARGUMENT_ERROR;
293 	}
294 
295 	return *this;
296 }
297