/** Additions to $(STDMODULE _utf).

Copyright: Denis Shelomovskij 2012-2013

License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).

Authors: Denis Shelomovskij
*/
module unstd.utf;


version(D_NoBoundsChecks) { }
else import core.exception;

import std.traits;
public import std.utf;


@safe:

/// Detect whether $(D c) is a UTF-8 continuation byte.
bool isContinuationByte(in char c) pure nothrow @nogc
{
	// Continuation bytes are exactly those with the bit pattern 10xxxxxx.
	return (c & 0xC0) == 0x80;
}

pure nothrow @nogc unittest
{
	assert(!isContinuationByte('a'));
	assert(!isContinuationByte('\xD0')); // 110xxxxx: lead byte of a 2-byte sequence
	assert( isContinuationByte('\x80'));
	assert( isContinuationByte('\xBF'));
	assert(!isContinuationByte('\xC0'));
}

/// Detect whether $(D c) is a UTF-16 lead/trail surrogate or not a surrogate.
bool isLeadSurrogate(in wchar c) pure nothrow @nogc
{
	// Lead (high) surrogates occupy [0xD800, 0xDC00).
	return c >= 0xD800 && c < 0xDC00;
}

/// ditto
bool isTrailSurrogate(in wchar c) pure nothrow @nogc
{
	// Trail (low) surrogates occupy [0xDC00, 0xE000).
	return c >= 0xDC00 && c < 0xE000;
}

/// ditto
bool isValidBMPCharacter(in wchar c) pure nothrow @nogc
{
	// Everything outside the surrogate range [0xD800, 0xE000).
	return c < 0xD800 || c >= 0xE000;
}

pure nothrow @nogc unittest
{
	foreach(c; "zд"w)
	{
		assert(!isLeadSurrogate(c));
		assert(!isTrailSurrogate(c));
		assert( isValidBMPCharacter(c));
	}

	// A non-BMP character is encoded as a lead surrogate followed by a trail surrogate.
	foreach(i, c; "\U00010143"w)
	{
		assert(isLeadSurrogate(c) == !i);
		assert(isTrailSurrogate(c) == i);
		assert(!isValidBMPCharacter(c));
	}
}


/// Detect whether $(D c) is the first code unit in a sequence.
bool isSequenceStart(C)(in C c) pure nothrow @nogc
if(isSomeChar!C)
{
	static if(is(C : char))
		return !isContinuationByte(c); // UTF-8: anything but a continuation byte
	else static if(is(C : wchar))
		return !isTrailSurrogate(c); // UTF-16: anything but a trail surrogate
	else static if(is(C : dchar))
		return true; // Always true
	else
		static assert(0);
}

pure nothrow @nogc unittest
{
	import unstd.generictuple;

	foreach(str; expressionTuple!("a", "a"w, "a"d, "д", "д"w, "д"d, "\U00010143"w, "\U00010143"d))
		assert(isSequenceStart(str[0]));

	assert(!isSequenceStart("д"[1]));
	assert(!isSequenceStart("\U00010143"w[1]));
}


/**
Adjust $(D idx) to point at the start of a UTF sequence or
at the end of $(D str).

$(D adjustBack) moves $(D idx) backward (towards the beginning of $(D str))
and $(D adjustForward) moves it forward (towards the end of $(D str)).
An index which already points at a sequence start or equals
$(D str.length) is returned unchanged.

Preconditions:
$(D idx <= str.length)

Malformed input (a sequence of more than 4 UTF-8 code units, a string
starting from a continuation byte or a trail surrogate, etc.) is detected
by assertions only.
*/
size_t adjustBack(C)(in C[] str, size_t idx) pure nothrow @nogc
if(isSomeChar!C)
in { assert(idx <= str.length); }
body
{
	static if(is(C : char))
	{
		if(idx != str.length)
		{
			// Step back over at most 3 continuation bytes.
			foreach(_; 0 .. 4 - 1) // Don't expect 5 and 6 byte combinations
			{
				if(!isContinuationByte(str[idx]))
					return idx;
				assert(idx, "String starts from UTF-8 continuation byte.");
				--idx;
			}
			assert(!isContinuationByte(str[idx]), "UTF-8 sequence length exceeds 4 bytes.");
		}
	}
	else static if(is(C : wchar))
	{
		if(idx != str.length && isTrailSurrogate(str[idx]))
		{
			assert(idx, "String starts from UTF-16 trail surrogate.");
			--idx;
			assert(isLeadSurrogate(str[idx]), "UTF-16 lead surrogate expected before trail surrogate.");
		}
	}
	else
	{
		// UTF-32: every index is a sequence start.
		static assert(is(C : dchar));
	}

	return idx;
}

pure nothrow @nogc unittest
{
	assert("a".adjustBack(0) == 0);
	assert("a".adjustBack(1) == 1);
	assert("ab".adjustBack(1) == 1);
	assert("д".adjustBack(1) == 0);
	assert("дb".adjustBack(2) == 2);
	foreach(i; 0 .. 4)
		assert("\U00010143".adjustBack(i) == 0);
	assert("\U00010143".adjustBack(4) == 4);
	assert("\U00010143"w.adjustBack(1) == 0);
}

/// ditto
size_t adjustForward(C)(in C[] str, size_t idx) pure nothrow @nogc
in { assert(idx <= str.length); }
body
{
	static if(is(C : char))
	{
		if(idx != str.length)
		{
			// Step forward over at most 3 continuation bytes.
			foreach(_; 0 .. 4 - 1) // Don't expect 5 and 6 byte combinations
			{
				if(idx == str.length || !isContinuationByte(str[idx]))
					return idx;
				++idx;
			}
			assert(idx == str.length || !isContinuationByte(str[idx]), "UTF-8 sequence length exceeds 4 bytes.");
		}
	}
	else static if(is(C : wchar))
	{
		if(idx != str.length && isTrailSurrogate(str[idx]))
		{
			++idx;
			// Any sequence start (a BMP character or a lead surrogate) or the end
			// of the string may follow a trail surrogate. Only a second trail
			// surrogate in a row is malformed.
			assert(idx == str.length || !isTrailSurrogate(str[idx]), "UTF-16 trail surrogate follows another trail surrogate.");
		}
	}
	else
	{
		// UTF-32: every index is a sequence start.
		static assert(is(C : dchar));
	}

	return idx;
}

pure nothrow @nogc unittest
{
	assert("a".adjustForward(0) == 0);
	assert("a".adjustForward(1) == 1);
	assert("ab".adjustForward(1) == 1);
	assert("д".adjustForward(1) == 2);
	assert("дb".adjustForward(2) == 2);
	assert("\U00010143".adjustForward(0) == 0);
	foreach(i; 1 .. 5)
		assert("\U00010143".adjustForward(i) == 4);
	assert("\U00010143"w.adjustForward(1) == 2);

	// A trail surrogate may be followed by a BMP character or by another pair.
	assert("\U00010143z"w.adjustForward(1) == 2);
	assert("\U00010143\U00010143"w.adjustForward(1) == 2);
	assert("\U00010143\U00010143"w.adjustForward(3) == 4);
}


/**
Returns minimum/maximum possible length of string conversion
to another Unicode Transformation Format result.

Params:
To = character type of the conversion result
From = character type of the source string
length = length of the source string in $(D From) code units
str = source string (only its length is used)
*/
size_t minLength(To, From)(in size_t length) pure nothrow @nogc
if(isSomeChar!To && isSomeChar!From)
{
	static if (To.sizeof <= From.sizeof)
		return length; // best case: every character requires only one code unit

	// n / m + !!(n % m) == (n + m-1) / m
	else static if (To.sizeof == 4 && From.sizeof == 2)
		return (length + 1) / 2; // best case: only surrogate pairs
	else static if (To.sizeof == 4 && From.sizeof == 1)
		return (length + 3) / 4; // best case: every dchar not in BMP
	else static if (To.sizeof == 2 && From.sizeof == 1)
		return (length + 2) / 3; // best case: every wchar in top of BMP
	else
		static assert(0);
}

/// ditto
size_t minLength(To, From)(in From[] str) pure nothrow @nogc
{ return minLength!(To, From)(str.length); }

pure nothrow @nogc unittest
{
	assert(minLength!char("abc"d) == 3);
	assert(minLength!dchar("ab"w) == 1);
	assert(minLength!dchar("abc"w) == 2);
	assert(minLength!dchar("abcd"w) == 2);
	assert(minLength!dchar("abcd") == 1);
	assert(minLength!dchar("abcde") == 2);
	assert(minLength!dchar("abcdef") == 2);
	assert(minLength!dchar("abcdefg") == 2);
	assert(minLength!dchar("abcdefgh") == 2);
	assert(minLength!dchar("abcdefghi") == 3);
	assert(minLength!wchar("abc") == 1);
	assert(minLength!wchar("abcd") == 2);
	assert(minLength!wchar("abcde") == 2);
	assert(minLength!wchar("abcdef") == 2);
	assert(minLength!wchar("abcdefg") == 3);
}


/// ditto
size_t maxLength(To, From)(in size_t length) pure nothrow @nogc
if(isSomeChar!To && isSomeChar!From)
{
	// `k` is the maximum number of `To` code units produced per `From` code unit.
	static if (To.sizeof >= From.sizeof)
		enum k = 1; // worst case: every code unit represents a character
	else static if (To.sizeof == 1 && From.sizeof == 2)
		enum k = 3; // worst case: every wchar in top of BMP
	else static if (To.sizeof == 1 && From.sizeof == 4)
		enum k = 4; // worst case: every dchar not in BMP
	else static if (To.sizeof == 2 && From.sizeof == 4)
		enum k = 2; // worst case: every dchar not in BMP
	else
		static assert(0);
	return length * k;
}

/// ditto
size_t maxLength(To, From)(in From[] str) pure nothrow @nogc
{ return maxLength!(To, From)(str.length); }

pure nothrow @nogc unittest
{
	assert(maxLength!char("abc") == 3);
	assert(maxLength!dchar("abc") == 3);
	assert(maxLength!char("abc"w) == 9);
	assert(maxLength!char("abc"d) == 12);
	assert(maxLength!wchar("abc"d) == 6);
}


///
pure unittest
{
	import std.range;
	import std.utf;

	const str = "abc-ЭЮЯ";
	const wlen = toUTF16(str).length;
	const dlen = walkLength(str);
	assert(wlen >= minLength!wchar(str) && wlen <= maxLength!wchar(str));
	assert(dlen >= minLength!dchar(str) && dlen <= maxLength!dchar(str));
}


/**
Copies text from $(D source) to $(D buff) performing conversion
to different Unicode transformation format if needed.

$(D buff) must be large enough to hold the result.
Unless compiled with $(D D_NoBoundsChecks), $(D onRangeError) is called
when a converted character does not fit into the rest of $(D buff).

Preconditions:
$(D buff.length >= minLength!To(source))

Returns:
Slice of the provided buffer $(D buff) with the copy of $(D source).
*/
To[] copyEncoded(To, From)(in From[] source, To[] buff) @trusted
if(isSomeChar!To && isSomeChar!From)
in { assert(buff.length >= minLength!To(source)); }
body
{
	static if(is(Unqual!To == Unqual!From))
	{
		// Same encoding: plain array copy.
		return buff[0 .. source.length] = source[];
	}
	else
	{
		To* ptr = buff.ptr;
		const To* last = ptr + buff.length;
		// `foreach` with a `dchar` variable decodes `source` character by character.
		foreach(dchar dc; source)
		{
			version(D_NoBoundsChecks) { }
			else if(ptr + codeLength!To(dc) > last)
				onRangeError();

			static if(is(Unqual!To == dchar))
				*ptr++ = dc;
			else
				// Warning: assume `encode` uses only needed bytes.
				// The rest of `buff` is reinterpreted as the fixed-size array
				// `encode` expects, which may extend past the end of `buff`.
				ptr += encode(*(cast(To[4 / To.sizeof]*) ptr), dc);
		}
		return buff[0 .. ptr - buff.ptr];
	}
}

///
pure unittest
{
	const str = "abc-ЭЮЯ";
	wchar[100] wsbuff;
	assert(copyEncoded(str, wsbuff) == "abc-ЭЮЯ"w);
}

pure nothrow unittest
{
	wchar[100] wsbuff;
	assert(copyEncoded("abc-ЭЮЯ"w, wsbuff) == "abc-ЭЮЯ"w);
}

pure unittest
{
	import std.range;

	const str = "abc-ЭЮЯ";
	char[100] sbuff;

	{
		wchar[100] wsbuff;
		const strW = toUTF16(str);
		assert(copyEncoded(str, wsbuff[0 .. strW.length]) == strW);
		assert(copyEncoded(strW, sbuff[0 .. str.length]) == str);
	}
	{
		dchar[100] dsbuff;
		const strD = toUTF32(str);
		assert(copyEncoded(str, dsbuff[0 .. walkLength(str)]) == strD);
		assert(copyEncoded(strD, sbuff[0 .. str.length]) == str);
	}
}


/**
Copies as much text from the beginning of $(D source) to $(D buff) as latter can hold
performing conversion to different Unicode transformation format if needed.

$(D source) will be set to its uncopied slice.

Only whole characters are copied. If $(D buff) cannot hold even the first
character of $(D source), nothing is copied, $(D source) is left unchanged
and an empty slice is returned, so a loop like the one below makes progress
only if $(D buff) is able to hold any single character of $(D source).

Returns:
Slice of the provided buffer $(D buff) with a (partial) copy of $(D source).

Examples:
---
import std.array: empty;

const(char)[] buff = ...;
wchar[n] wbuff = void;
while(!buff.empty)
	f(buff.copySomeEncoded(wbuff)); // `f` accepts at most `n` wide characters
---
*/
To[] copySomeEncoded(To, From)(ref inout(From)[] source, To[] buff) @trusted pure
if(isSomeChar!To && isSomeChar!From)
{
	static if(is(Unqual!To == Unqual!From))
	{
		// Same encoding: copy everything, or cut at the last sequence start
		// which still fits into `buff`.
		const length = source.length <= buff.length ? source.length : source.adjustBack(buff.length);
		auto res = buff[0 .. length] = source[0 .. length];
		source = source[length .. $];
		return res;
	}
	else
	{
		To* ptr = buff.ptr;
		const To* last = ptr + buff.length;
		// Index of the first uncopied code unit; stays `source.length`
		// if the whole `source` fits.
		size_t end = source.length;
		foreach(i, dchar dc; source)
		{
			if(ptr + codeLength!To(dc) > last)
			{
				end = i;
				break;
			}

			static if(is(Unqual!To == dchar))
				*ptr++ = dc;
			else
				// Warning: assume `encode` uses only needed bytes.
				ptr += encode(*(cast(To[4 / To.sizeof]*) ptr), dc);
		}
		source = source[end .. $];
		return buff[0 .. ptr - buff.ptr];
	}
}

pure unittest
{
	import std.array: empty;
	import unstd.generictuple;

	foreach(str; expressionTuple!("abcdef", "zэюяzzэюяzzэюя", "z\U00010143z"))
		foreach(f; GenericTuple!(s => s, toUTF16, toUTF32))
	{
		foreach(n; expressionTuple!(2, 3, 4, 10))
		{
			auto buff = f(str);
			wchar[] allWchars;
			wchar[n] wbuff = void;
			while(!buff.empty)
				allWchars ~= buff.copySomeEncoded(wbuff[]);
			assert(allWchars == str.toUTF16());
		}
	}
}