unstd.utf source code

1 /** Additions to $(STDMODULE _utf).
2 
3 Copyright: Denis Shelomovskij 2012-2013
4 
5 License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
6 
7 Authors: Denis Shelomovskij
8 */
9 module unstd.utf;
10 
11 
12 version(D_NoBoundsChecks) { }
13 else import core.exception;
14 
15 import std.traits;
16 public import std.utf;
17 
18 
19 @safe:
20 
/// Detect whether $(D c) is a UTF-8 continuation byte.
bool isContinuationByte(in char c) pure nothrow @nogc
{
	// A continuation byte has the bit layout 10xxxxxx: isolate the two
	// most significant bits and compare them with the `10` tag.
	enum topTwoBits = 0xC0;
	enum continuationTag = 0x80;
	return (c & topTwoBits) == continuationTag;
}
26 
/// Detect whether $(D c) is a UTF-16 lead/trail surrogate or not a surrogate.
bool isLeadSurrogate(in wchar c) pure nothrow @nogc
{
	// Lead surrogates are the code units 0xD800 .. 0xDBFF, i.e. exactly
	// those whose six most significant bits are 110110.
	return (c & 0xFC00) == 0xD800;
}
32 
/// ditto
bool isTrailSurrogate(in wchar c) pure nothrow @nogc
{
	// Trail surrogates are the code units 0xDC00 .. 0xDFFF, i.e. exactly
	// those whose six most significant bits are 110111.
	return (c & 0xFC00) == 0xDC00;
}
38 
/// ditto
bool isValidBMPCharacter(in wchar c) pure nothrow @nogc
{
	// The whole surrogate area 0xD800 .. 0xDFFF shares the five most
	// significant bits 11011; every other code unit stands for itself.
	return (c & 0xF800) != 0xD800;
}
44 
pure nothrow @nogc unittest
{
	// Ordinary BMP characters are neither kind of surrogate.
	foreach(c; "zд"w)
	{
		assert(!isLeadSurrogate(c));
		assert(!isTrailSurrogate(c));
		assert( isValidBMPCharacter(c));
	}

	// U+10143 lies outside the BMP, so its UTF-16 form is a surrogate pair:
	// code unit 0 is the lead surrogate and code unit 1 the trail surrogate.
	foreach(i, c; "\U00010143"w)
	{
		assert(isLeadSurrogate(c) == (i == 0));
		assert(isTrailSurrogate(c) == (i == 1));
		assert(!isValidBMPCharacter(c));
	}
}
63 
64 
/// Detect whether $(D c) is the first code unit in a sequence.
bool isSequenceStart(C)(in C c) pure nothrow @nogc
if(isSomeChar!C)
{
	static if(is(C : char))
	{
		// UTF-8: everything except a continuation byte opens a sequence.
		const startsSequence = !isContinuationByte(c);
	}
	else static if(is(C : wchar))
	{
		// UTF-16: everything except a trail surrogate opens a sequence.
		const startsSequence = !isTrailSurrogate(c);
	}
	else
	{
		// UTF-32: every code unit is a sequence on its own.
		static assert(is(C : dchar));
		enum startsSequence = true;
	}

	return startsSequence;
}
78 
pure nothrow @nogc unittest
{
	import unstd.generictuple;

	// The first code unit of a string always opens a sequence,
	// whatever the encoding and whatever the character width.
	foreach(text; expressionTuple!("a", "a"w, "a"d, "д", "д"w, "д"d, "\U00010143"w, "\U00010143"d))
	{
		assert(isSequenceStart(text[0]));
	}

	// Second byte of a two-byte UTF-8 sequence.
	assert(!isSequenceStart("д"[1]));

	// Trail half of a UTF-16 surrogate pair.
	assert(!isSequenceStart("\U00010143"w[1]));
}
89 
90 
/**
Adjust $(D idx) to point at the start of a UTF sequence or
at the end of $(D str).

$(D adjustBack) moves $(D idx) towards the beginning of $(D str) when it
points into the middle of a sequence; an index that already is a sequence
start, or that equals $(D str.length), is returned unchanged.

Preconditions:
$(D idx <= str.length)
*/
size_t adjustBack(C)(in C[] str, size_t idx) pure nothrow @nogc
if(isSomeChar!C)
in { assert(idx <= str.length); }
body
{
	static if(is(C : char))
	{
		if(idx == str.length)
			return idx;

		// Don't expect 5 and 6 byte combinations: a sequence start can be
		// followed by at most three continuation bytes.
		enum maxContinuationBytes = 4 - 1;
		foreach(step; 0 .. maxContinuationBytes)
		{
			if(!isContinuationByte(str[idx]))
				return idx;
			assert(idx, "String starts from UTF-8 continuation byte.");
			--idx;
		}

		// Three steps back must have reached the start of the sequence.
		assert(!isContinuationByte(str[idx]), "UTF-8 sequence length exceeds 4 bytes.");
		return idx;
	}
	else static if(is(C : wchar))
	{
		// Only a trail surrogate can sit in the middle of a UTF-16 sequence.
		if(idx == str.length || !isTrailSurrogate(str[idx]))
			return idx;

		assert(idx, "String starts from UTF-16 trail surrogate.");
		--idx;
		assert(isLeadSurrogate(str[idx]), "UTF-16 lead surrogate expected before trail surrogate.");
		return idx;
	}
	else
	{
		// UTF-32: every index is a sequence start.
		static assert(is(C : dchar));
		return idx;
	}
}
130 
pure nothrow @nogc unittest
{
	// ASCII only: every index already is a sequence start.
	assert(adjustBack("a", 0) == 0);
	assert(adjustBack("a", 1) == 1);
	assert(adjustBack("ab", 1) == 1);

	// Two-byte UTF-8 sequence: its second byte is pulled back to the first.
	assert(adjustBack("д", 1) == 0);
	assert(adjustBack("дb", 2) == 2);

	// Four-byte UTF-8 sequence: every inner index maps to its start,
	// the index one past the end stays where it is.
	string fourBytes = "\U00010143";
	foreach(i; 0 .. fourBytes.length)
		assert(adjustBack(fourBytes, i) == 0);
	assert(adjustBack(fourBytes, fourBytes.length) == fourBytes.length);

	// UTF-16 surrogate pair: the trail surrogate is pulled back to the lead.
	assert(adjustBack("\U00010143"w, 1) == 0);
}
143 
/// ditto
size_t adjustForward(C)(in C[] str, size_t idx) pure nothrow @nogc
in { assert(idx <= str.length); }
body
{
	// Unlike `adjustBack`, an index pointing into the middle of a sequence
	// is moved towards the end of `str`: to the start of the next sequence
	// or to `str.length` when there is none.
	static if(is(C : char))
	{
		if(idx != str.length)
		{
			// Skip at most three continuation bytes.
			foreach(_; 0 .. 4 - 1) // Don't expect 5 and 6 byte combinations
			{
				if(idx == str.length || !isContinuationByte(str[idx]))
					return idx;
				++idx;
			}
			assert(idx == str.length || !isContinuationByte(str[idx]), "UTF-8 sequence length exceeds 4 bytes.");
		}
	}
	else static if(is(C : wchar))
	{
		if(idx != str.length && isTrailSurrogate(str[idx]))
		{
			++idx;
			// Whatever follows a trail surrogate must start a new sequence.
			// That is either a lead surrogate or an ordinary BMP code unit,
			// so the only malformed case is a second trail surrogate.
			assert(idx == str.length || !isTrailSurrogate(str[idx]), "Unexpected UTF-16 trail surrogate after trail surrogate.");
		}
	}
	else
	{
		// UTF-32: every index is a sequence start.
		static assert(is(C : dchar));
	}

	return idx;
}

pure nothrow @nogc unittest
{
	assert("a".adjustForward(0) == 0);
	assert("a".adjustForward(1) == 1);
	assert("ab".adjustForward(1) == 1);
	assert("д".adjustForward(1) == 2);
	assert("дb".adjustForward(2) == 2);
	assert("\U00010143".adjustForward(0) == 0);
	foreach(i; 1 .. 5)
		assert("\U00010143".adjustForward(i) == 4);
	assert("\U00010143"w.adjustForward(1) == 2);

	// A surrogate pair may be followed by a plain BMP character
	// as well as by another surrogate pair.
	assert("\U00010143z"w.adjustForward(1) == 2);
	assert("\U00010143\U00010143"w.adjustForward(1) == 2);
	assert("\U00010143\U00010143"w.adjustForward(3) == 4);
}
190 
191 
/**
Returns minimum/maximum possible length of string conversion
to another Unicode Transformation Format result.

The bound is derived solely from the number of source code units and from
the code unit sizes of $(D From) and $(D To); string contents are not inspected.

Params:
length = number of $(D From) code units in the source string.
str = source string; only its length is used.

Returns:
The bound on the number of $(D To) code units in the conversion result.
*/
size_t minLength(To, From)(in size_t length) pure nothrow @nogc
if(isSomeChar!To && isSomeChar!From)
{
	static if (To.sizeof <= From.sizeof)
	{
		return length; // best case: every character requires only one code unit
	}
	else
	{
		// Largest number of source code units that can collapse into
		// a single target code unit.
		static if (To.sizeof == 4 && From.sizeof == 2)
			enum size_t ratio = 2; // best case: only surrogate pairs
		else static if (To.sizeof == 4 && From.sizeof == 1)
			enum size_t ratio = 4; // best case: every dchar not in BMP
		else static if (To.sizeof == 2 && From.sizeof == 1)
			enum size_t ratio = 3; // best case: every wchar in top of BMP
		else
			static assert(0);

		// Ceiling division. The shorter `(length + ratio - 1) / ratio` form
		// wraps around for `length` close to `size_t.max`, this one cannot.
		return length / ratio + (length % ratio != 0 ? 1 : 0);
	}
}

/// ditto
size_t minLength(To, From)(in From[] str) pure nothrow @nogc
{ return minLength!(To, From)(str.length); }
216 
pure nothrow @nogc unittest
{
	// Target code units are not wider than source ones: one unit per unit.
	assert(minLength!char("abc"d) == 3);

	// UTF-16 -> UTF-32, source lengths 2 .. 4.
	wstring wide = "abcd"w;
	static immutable size_t[3] wideToDchar = [1, 2, 2];
	foreach(i, expected; wideToDchar)
		assert(minLength!dchar(wide[0 .. 2 + i]) == expected);

	string narrow = "abcdefghi";

	// UTF-8 -> UTF-32, source lengths 4 .. 9.
	static immutable size_t[6] narrowToDchar = [1, 2, 2, 2, 2, 3];
	foreach(i, expected; narrowToDchar)
		assert(minLength!dchar(narrow[0 .. 4 + i]) == expected);

	// UTF-8 -> UTF-16, source lengths 3 .. 7.
	static immutable size_t[5] narrowToWchar = [1, 2, 2, 2, 3];
	foreach(i, expected; narrowToWchar)
		assert(minLength!wchar(narrow[0 .. 3 + i]) == expected);
}
235 
236 
/// ditto
size_t maxLength(To, From)(in size_t length) pure nothrow @nogc
if(isSomeChar!To && isSomeChar!From)
{
	// Each branch multiplies `length` by the largest number of target code
	// units a single source code unit can expand into.
	static if (To.sizeof >= From.sizeof)
		return length; // worst case: every code unit represents a character
	else static if (To.sizeof == 1 && From.sizeof == 2)
		return length * 3; // worst case: every wchar in top of BMP
	else static if (To.sizeof == 1 && From.sizeof == 4)
		return length * 4; // worst case: every dchar not in BMP
	else static if (To.sizeof == 2 && From.sizeof == 4)
		return length * 2; // worst case: every dchar not in BMP
	else
		static assert(0);
}

/// ditto
size_t maxLength(To, From)(in From[] str) pure nothrow @nogc
{
	// Only the number of code units matters, the contents are not inspected.
	const sourceUnits = str.length;
	return maxLength!(To, From)(sourceUnits);
}
257 
pure nothrow @nogc unittest
{
	enum units = 3; // every source string below holds three code units

	// Target code units at least as wide as source ones: no expansion.
	assert(maxLength!char("abc") == units);
	assert(maxLength!dchar("abc") == units);

	// Narrower target code units: expansion by a fixed factor.
	assert(maxLength!char("abc"w) == units * 3);
	assert(maxLength!char("abc"d) == units * 4);
	assert(maxLength!wchar("abc"d) == units * 2);
}
266 
267 
///
pure unittest
{
	import std.range;
	import std.utf;

	const text = "abc-ЭЮЯ";

	// Actual lengths of the UTF-16 and UTF-32 forms of `text`...
	const utf16Units = text.toUTF16().length;
	const codePoints = text.walkLength();

	// ...always lie between the computed bounds.
	assert(minLength!wchar(text) <= utf16Units);
	assert(utf16Units <= maxLength!wchar(text));
	assert(minLength!dchar(text) <= codePoints);
	assert(codePoints <= maxLength!dchar(text));
}
280 
281 
/**
Copies text from $(D source) to $(D buff) performing conversion
to different unicode transformation format if needed.

$(D buff) must be large enough to hold the result.

Unless compiled with bounds checking disabled ($(D version(D_NoBoundsChecks))),
a range error is raised if $(D buff) is too small. Nothing is written past
the returned slice.

Preconditions:
$(D buff.length >= minLength!To(source))

Returns:
Slice of the provided buffer $(D buff) with the copy of $(D source).
*/
To[] copyEncoded(To, From)(in From[] source, To[] buff) @trusted
if(isSomeChar!To && isSomeChar!From)
in { assert(buff.length >= minLength!To(source)); }
body
{
	static if(is(Unqual!To == Unqual!From))
	{
		// Same encoding: plain (bounds-checked) slice copy.
		return buff[0 .. source.length] = source[];
	}
	else
	{
		To* ptr = buff.ptr;
		const To* last = ptr + buff.length;
		foreach(dchar dc; source)
		{
			version(D_NoBoundsChecks) { }
			else
			{
				// Make sure the whole encoded character fits.
				if(ptr + codeLength!To(dc) > last)
					onRangeError();
			}

			static if(is(Unqual!To == dchar))
			{
				*ptr++ = dc;
			}
			else
			{
				// `encode` takes a whole `To[4 / To.sizeof]` static array and
				// may (re)initialize all of it, not only the code units it
				// returns as used. Encoding straight into the destination could
				// therefore clobber memory after the character, including memory
				// past the end of `buff`. Encode into scratch storage instead
				// and copy exactly the produced code units.
				To[4 / To.sizeof] scratch;
				const count = encode(scratch, dc);
				ptr[0 .. count] = scratch[0 .. count];
				ptr += count;
			}
		}
		return buff[0 .. ptr - buff.ptr];
	}
}
322 
///
pure unittest
{
	const source = "abc-ЭЮЯ";
	wchar[100] buffer;

	// UTF-8 text is converted to UTF-16 on the fly.
	const converted = copyEncoded(source, buffer);
	assert(converted == "abc-ЭЮЯ"w);
}
330 
pure nothrow unittest
{
	// Source and destination share the encoding: the text is copied as is.
	wchar[100] buffer;
	wstring text = "abc-ЭЮЯ"w;
	assert(copyEncoded(text, buffer) == text);
}
336 
pure unittest
{
	import std.range;

	const utf8 = "abc-ЭЮЯ";
	char[100] narrowBuff;

	// UTF-8 <-> UTF-16, with destination slices of exactly the needed size.
	{
		wchar[100] wideBuff;
		const utf16 = toUTF16(utf8);
		auto toWide = copyEncoded(utf8, wideBuff[0 .. utf16.length]);
		assert(toWide == utf16);
		auto toNarrow = copyEncoded(utf16, narrowBuff[0 .. utf8.length]);
		assert(toNarrow == utf8);
	}

	// UTF-8 <-> UTF-32, with destination slices of exactly the needed size.
	{
		dchar[100] pointBuff;
		const utf32 = toUTF32(utf8);
		auto toPoints = copyEncoded(utf8, pointBuff[0 .. walkLength(utf8)]);
		assert(toPoints == utf32);
		auto toNarrow = copyEncoded(utf32, narrowBuff[0 .. utf8.length]);
		assert(toNarrow == utf8);
	}
}
357 
358 
/**
Copies as much text from the beginning of $(D source) to $(D buff) as latter can hold
performing conversion to different unicode transformation format if needed.

Only whole characters are copied: a character whose encoded form does not fit
into the rest of $(D buff) is left in $(D source). Nothing is written past
the returned slice.

$(D source) will be set to its uncopied slice.

Returns:
Slice of the provided buffer $(D buff) with a (partial) copy of $(D source).

Examples:
---
import std.array: empty;

const(char)[] buff = ...;
wchar[n] wbuff = void;
while(!buff.empty)
	f(buff.copySomeEncoded(wbuff)); // `f` accepts at most `n` wide characters
---
*/
To[] copySomeEncoded(To, From)(ref inout(From)[] source, To[] buff) @trusted pure
if(isSomeChar!To && isSomeChar!From)
{
	static if(is(Unqual!To == Unqual!From))
	{
		// Same encoding: copy everything if it fits, otherwise cut at the
		// last sequence start that is not beyond the buffer capacity.
		const length = source.length <= buff.length ? source.length : source.adjustBack(buff.length);
		auto res = buff[0 .. length] = source[0 .. length];
		source = source[length .. $];
		return res;
	}
	else
	{
		To* ptr = buff.ptr;
		const To* last = ptr + buff.length;
		// Index of the first source code unit that was not consumed.
		size_t consumed = source.length;
		foreach(i, dchar dc; source)
		{
			// Stop before the first character that does not fit entirely.
			if(ptr + codeLength!To(dc) > last)
			{
				consumed = i;
				break;
			}

			static if(is(Unqual!To == dchar))
			{
				*ptr++ = dc;
			}
			else
			{
				// `encode` takes a whole `To[4 / To.sizeof]` static array and
				// may (re)initialize all of it, not only the code units it
				// returns as used. Encoding straight into the destination could
				// therefore clobber memory after the character, including memory
				// past the end of `buff`. Encode into scratch storage instead
				// and copy exactly the produced code units.
				To[4 / To.sizeof] scratch;
				const count = encode(scratch, dc);
				ptr[0 .. count] = scratch[0 .. count];
				ptr += count;
			}
		}
		source = source[consumed .. $];
		return buff[0 .. ptr - buff.ptr];
	}
}
411 
pure unittest
{
	import std.array: empty;
	import unstd.generictuple;

	// Every sample text, in every source encoding, is pushed through
	// destination buffers of several capacities; the concatenated chunks
	// must equal the UTF-16 form of the whole text.
	foreach(text; expressionTuple!("abcdef", "zэюяzzэюяzzэюя", "z\U00010143z"))
		foreach(convert; GenericTuple!(s => s, toUTF16, toUTF32))
			foreach(capacity; expressionTuple!(2, 3, 4, 10))
			{
				auto remaining = convert(text);
				wchar[capacity] chunk = void;
				wchar[] collected;
				while(!remaining.empty)
					collected ~= remaining.copySomeEncoded(chunk[]);
				assert(collected == text.toUTF16());
			}
}
429 		}
430 }