C++, (UTF-8) 971바이트
#include<cstdint>
// Golfed build of the transcoder, all on one line: the aliases u/U, the globals
// i (input encoding), o (output encoding), x, b, m, and the functions
// R (read one code unit), r (read one character), w (write one character)
// and t (transcode until a zero character).
// NOTE(review): w() sizes UTF-8 output with `~63<<m&x`, which stops one step early
// for U+0800..U+0FFF and U+10000..U+3FFFF (e.g. U+0800 is written as E0 80).
// NOTE(review): t() loops while `U(x+x)` is non-zero; that is still non-zero for
// the error value ~0, so the loop does not stop on a decoding error.
using u=uint8_t;using U=uint32_t;U i,o,x,b,m;U R(u*&p){x=*p++;if(!i){m=0;while(128>>m&x)++m;if(m>1)for(x&=127>>m;--m;)x=x<<6|((*p&192)-128?~0:*p++&63);return m?x=~0:x;}else if(i<3){x<<=8;x+=*p++;}else if(i<4){x+=*p++<<8;}else if(i<6){x<<=24;x+=*p++<<16;x+=*p++<<8;x+=*p++;}else{x+=*p++<<8;x+=*p++<<16;x+=*p++<<24;}return x;}U r(u*&p){U x0=R(p);if(i&&i<4&&x>>10==54)x=R(p)>>10==55?(x0<<10)+x-56613888:~0;if(!b++){if(x==65279)if(!i||i%3==1)r(p);else x=~0;else if(x==65534&&i==1)i=3,r(p);else if(x==4294836224&&i==4)i=6,r(p);}return x>1114111||x>>11==27?x=~0:x;}void w(U x,u*&p){if(!o){if(x<128)*p++=x;else{for(m=0;~63<<m&x;m+=6);for(*p++=~127>>m/6|x>>m;m;)*p++=128|x>>(m-=6)&63;}}else if(o<4&&x>65535)x-=65536,w(55296|x>>10,p),w(56320|x&1023,p);else if(o<3)*p++=x>>8,*p++=x;else if(o<4)*p++=x,*p++=x>>8;else if(o<6)*p++=x>>24,*p++=x>>16,*p++=x>>8,*p++=x;else*p++=x,*p++=x>>8,*p++=x>>16,*p++=x>>24;}int t(u*&p,u*&q){for(b=0,x=1;U(x+x);)w(r(p),q);return x;}
아래의 읽기 쉬운 프로그램을 다음 Perl 명령으로 필터링하면 위의 압축된 형태로 줄일 수 있습니다.
perl -p0 -e 's!//.*!!g;s/\s+/ /g;s/ \B|\B //g;s/0x[\da-f]+/hex($&)/ige;s/#include<[^<>]+>/\n$&\n/g;s/^\n+//mg'
위의 명령은 다음을 수행합니다:
- 주석을 제거합니다
- 불필요한 공백을 제거합니다
- 16진 리터럴을 10진수로 변환합니다
- `#include` 줄 앞뒤의 줄 바꿈을 복원합니다
읽기 쉬운 코드
#include <cstdint>
// Short aliases: u is one byte of encoded text, U holds a decoded value.
using u = uint8_t;
using U = uint32_t;
// State shared by R(), r(), w() and t(); the caller sets i and o before use.
U i, // input encoding (one of the codes listed below)
o, // output encoding (one of the codes listed below)
x, // last value read by R()/r(); ~0 marks a decoding error
b, // number of r() calls so far; a BOM is only recognised while b==0
m; // scratch counter used while measuring UTF-8 sequences
// Encoding codes accepted in i and o:
// 0 UTF-8
// 1 UTF-16 (big-endian; on input a reversed BOM switches i to 3)
// 2 UTF-16BE
// 3 UTF-16LE
// 4 UTF-32 (big-endian; on input a reversed BOM switches i to 6)
// 5 UTF-32BE
// 6 UTF-32LE
// Read one unit of input in encoding `i`, advancing `p` past the bytes consumed.
// UTF-8 decodes a whole multi-byte sequence; the UTF-16 modes return a single
// 16-bit unit (surrogates are paired up by r()); the UTF-32 modes return one
// 32-bit unit. The value is left in the global `x` and also returned. In UTF-8,
// a stray or missing continuation byte yields ~0.
U R(u*& p) {
    x = *p++;
    switch (i) {
    case 0: // UTF-8
        // m = number of leading one bits in the first byte
        for (m = 0; 128 >> m & x; ++m) {
        }
        if (m == 1) { // a continuation byte cannot start a sequence
            x = ~0;
            return x;
        }
        if (m > 1) {
            x &= 127 >> m; // keep only the payload bits of the lead byte
            while (--m) {
                if ((*p & 192) == 128)
                    x = x << 6 | (*p++ & 63);
                else
                    x = ~0; // not a continuation byte; it is left unconsumed
            }
        }
        return x;
    case 1:
    case 2: // UTF-16, UTF-16BE: high byte first
        x = (x << 8) + *p++;
        break;
    case 3: // UTF-16LE: low byte first
        x += *p++ << 8;
        break;
    case 4:
    case 5: // UTF-32, UTF-32BE: fold in the remaining bytes, most significant first
        for (int k = 0; k < 3; ++k)
            x = (x << 8) + *p++;
        break;
    default: // UTF-32LE: remaining bytes are increasingly significant
        for (int shift = 8; shift <= 24; shift += 8)
            x += U(*p++) << shift;
        break;
    }
    return x;
}
// Read one complete character from `p`: joins UTF-16 surrogate pairs, handles a
// byte-order mark on the first character of the stream, and range-checks the
// result. Returns the code point (also left in the global `x`), or ~0 on error.
U r(u*& p) {
    const U lead = R(p);
    const bool utf16_input = i != 0 && i < 4;
    if (utf16_input && lead >> 10 == 54) { // high surrogate, 0xD800..0xDBFF
        const U trail = R(p);
        if (trail >> 10 == 55) // low surrogate, 0xDC00..0xDFFF
            x = (lead << 10) + trail - 56613888; // constant is (0xD800<<10) + 0xDC00 - 0x10000
        else
            x = ~0; // high surrogate without its partner
    }
    const bool first_char = b == 0;
    ++b;
    if (first_char) { // only the first character can be a BOM
        if (x == 0xFEFF) {
            if (i == 0 || i % 3 == 1)
                r(p); // BOM in UTF-8, UTF-16 or UTF-32: skip it and read the next character
            else
                x = ~0; // explicit-endian encodings must not carry a BOM
        } else if (x == 0xFFFE && i == 1) {
            i = 3; // byte-swapped BOM in UTF-16: the data is little-endian
            r(p);
        } else if (x == 0xFFFE0000 && i == 4) {
            i = 6; // byte-swapped BOM in UTF-32: the data is little-endian
            r(p);
        }
    }
    // Reject values beyond U+10FFFF and the surrogate range 0xD800..0xDFFF.
    if (x > 0x10ffff || x >> 11 == 27)
        x = ~0;
    return x;
}
// Write the character `x` (assumed to be a valid code point) at `p` in the
// output encoding `o`, advancing `p` past the bytes written. The parameter `x`
// shadows the global of the same name, so the global is not modified here.
//
// The UTF-8 branch picks the sequence length from the standard thresholds
// (0x80, 0x800, 0x10000). The previous bit-counting loop chose too few
// continuation bytes for U+0800..U+0FFF and U+10000..U+3FFFF, and relied on
// left-shifting a negative int.
void w(U x, u*& p) {
    if (!o) { // UTF-8
        if (x < 0x80) { // 0xxxxxxx
            *p++ = x;
        } else if (x < 0x800) { // 110xxxxx 10xxxxxx
            *p++ = 0xC0 | (x >> 6);
            *p++ = 0x80 | (x & 0x3F);
        } else if (x < 0x10000) { // 1110xxxx 10xxxxxx 10xxxxxx
            *p++ = 0xE0 | (x >> 12);
            *p++ = 0x80 | ((x >> 6) & 0x3F);
            *p++ = 0x80 | (x & 0x3F);
        } else { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            *p++ = 0xF0 | (x >> 18);
            *p++ = 0x80 | ((x >> 12) & 0x3F);
            *p++ = 0x80 | ((x >> 6) & 0x3F);
            *p++ = 0x80 | (x & 0x3F);
        }
    } else if (o < 4 && x > 0xFFFF) { // UTF-16: split into a surrogate pair
        x -= 0x10000;
        w(0xD800 | (x >> 10), p);   // high surrogate carries the top 10 bits
        w(0xDC00 | (x & 0x3FF), p); // low surrogate carries the bottom 10 bits
    } else if (o < 3) { // UTF-16, UTF-16BE
        *p++ = x >> 8;
        *p++ = x;
    } else if (o < 4) { // UTF-16LE
        *p++ = x;
        *p++ = x >> 8;
    } else if (o < 6) { // UTF-32, UTF-32BE
        *p++ = x >> 24;
        *p++ = x >> 16;
        *p++ = x >> 8;
        *p++ = x;
    } else { // UTF-32LE
        *p++ = x;
        *p++ = x >> 8;
        *p++ = x >> 16;
        *p++ = x >> 24;
    }
}
// Transcode the zero-terminated input at `p` (encoding `i`) into the buffer at
// `q` (encoding `o`). Both pointers are advanced as data is consumed/produced;
// no bounds checking is done on the output buffer.
// Returns 0 once the terminating zero character has been copied, or a non-zero
// value (the error marker ~0 converted to int) when the input is invalid.
//
// The previous loop condition `U(x+x)` is zero only for 0 and 0x80000000, so it
// never stopped on the error marker ~0 and kept reading past bad input. The
// error marker is also no longer handed to w(), which is documented as taking
// in-range characters only.
int t(u*& p, u*& q) // input, output
{
    b = 0; // a BOM is only honoured on the first character
    for (;;) {
        const U c = r(p);
        if (c == ~U(0))
            break; // decoding error: stop without emitting anything for it
        w(c, q);
        if (c == 0)
            break; // the terminator has been copied to the output
    }
    return x; // r() left its result in x: 0 on success, ~0 on failure
}
호출할 함수는 `t()`입니다. 입력 및 출력 인코딩은 각각 전역 변수 `i`와 `o`로 전달하고, `p`는 널로 종료된 입력 바이트를 가리켜야 합니다. `q`는 출력 버퍼를 가리키며, 이 버퍼는 덮어쓰여지므로 결과를 담을 만큼 충분히 커야 합니다 - 버퍼 오버런을 피하려는 시도는 하지 않습니다.
코드 주석이 충분한 설명이 되기를 바랍니다. 이해하기 너무 어려운 부분이 있으면 물어보십시오 (하지만 먼저 직접 시도해 보십시오!).
이 답변을 개발하는 동안 상당한 분량의 테스트 스위트를 작성했습니다. 다른 참가자에게 도움이 되고 요구 사항에 대한 제 해석을 문서화하기 위해 아래에 포함합니다.
테스트 함수
#include <vector>
#include <iostream>
std::ostream& operator<<(std::ostream& out, const std::vector<u>& v)
{
out << "{ ";
for (int i: v) out << i << " ";
out << "}";
return out;
}
int test_read(int encoding, std::vector<u> input, U expected)
{
b = 0;
i = encoding;
auto d = input.data();
U actual = r(d);
if (actual == expected) return 0;
std::cerr << std::hex << "Decoding " << encoding << "; " << input << " gave " << actual
<< " instead of " << expected << std::endl;
return 1;
}
int test_write(int encoding, U input, std::vector<u> expected)
{
o = encoding;
u buf[20], *p = buf;
w(input, p);
std::vector<u> actual(buf,p);
if (expected == actual) return 0;
std::cerr << std::hex << "Encoding " << encoding << "; " << input << " gave " << actual
<< " instead of " << expected << std::endl;
return 1;
}
int test_transcode(int ienc, std::vector<u> input, int oenc, std::vector<u> expected)
{
b = 0;
i = ienc; o = oenc;
u buf[200], *p = buf, *d = input.data();
int result = t(d, p);
std::vector<u> actual(buf,p);
if (result ? expected.empty() : expected == actual) return 0;
std::cerr << std::hex << "Encoding " << ienc << " to " << oenc << "; " << input << " gave " << actual
<< " instead of " << expected << std::endl;
return 1;
}
테스트 스위트
static const U FAIL = ~0;
int main() {
int e = 0; // error count
// UTF-8
e += test_read(0, { 128 }, FAIL); // unexpected continuation
e += test_read(0, { 128, 1 }, FAIL);
e += test_read(0, { 128, 128 }, FAIL);
e += test_read(0, { 192, 192 }, FAIL); // start without continuation
e += test_read(0, { 192, 0 }, FAIL);
e += test_read(0, { 224, 0 }, FAIL);
e += test_read(0, { 224, 192 }, FAIL);
e += test_read(0, { 0xf4, 0x90, 128, 128 }, FAIL); // Unicode maximum+1
e += test_read(0, { 127 }, 127);
e += test_read(0, { 192, 129 }, 1); // We accept overlong UTF-8
e += test_read(0, { 0xc2, 128 }, 128);
e += test_read(0, { 224, 128, 129 }, 1);
e += test_read(0, { 0xef, 128, 128 }, 0xF000);
e += test_read(0, { 0xef, 191, 191 }, 0xFFFF);
e += test_read(0, { 0xf4, 128, 128, 128 }, 0x100000);
e += test_read(0, { 0xf4, 0x8f, 191, 191 }, 0x10FFFF); // Unicode maximum
e += test_read(0, { 0xEF, 0xBB, 0xBF, 127 }, 127); // byte-order mark
e += test_write(0, 0, { 0 });
e += test_write(0, 127, { 127 });
e += test_write(0, 128, { 0xc2, 128 });
e += test_write(0, 255, { 0xc3, 191 });
e += test_write(0, 0xFFFF, { 0xef, 191, 191 });
e += test_write(0, 0x10FFFF, { 0xf4, 0x8f, 191, 191 });
// UTF-16
e += test_read(1, { 0, 1 }, 1);
e += test_read(1, { 0xd8, 0, 0xdc, 1 }, 0x10001);
e += test_read(1, { 0xdb, 0xff, 0xdf, 0xff }, 0x10ffff);
e += test_read(1, { 0xd8, 0, 0xd8, 1 }, FAIL); // mismatched surrogate
e += test_read(1, { 0xd8, 0, 0, 1 }, FAIL); // mismatched surrogate
e += test_read(1, { 0xdc, 0 }, FAIL);
e += test_write(1, 1, { 0, 1 });
e += test_write(1, 256, { 1, 0 });
e += test_write(1, 0xffff, { 255, 255 });
e += test_write(1, 0x10001, { 0xd8, 0, 0xdc, 1 });
e += test_write(1, 0x10ffff, { 0xdb, 0xff, 0xdf, 0xff });
// UTF-16LE
e += test_write(3, 1, { 1, 0 });
e += test_write(3, 256, { 0, 1 });
e += test_write(3, 0x10001, { 0, 0xd8, 1, 0xdc });
e += test_write(3, 0x10fffe, { 0xff, 0xdb, 0xfe, 0xdf });
// UTF-16 byte-order mark
e += test_read(1, { 0xFE, 0xFF, 0x0, 1 }, 1); // byte-order mark
e += test_read(1, { 0xFF, 0xFE, 1, 0x0 }, 1); // reversed byte-order mark
// disallowed byte-order marks
e += test_read(2, { 0xFE, 0xFF }, FAIL);
e += test_read(3, { 0xFF, 0xFE }, FAIL);
// reversed byte-order mark is an unassigned character - to be treated like regular character, according to question
e += test_read(2, { 0xFF, 0xFE }, 0xfffe);
e += test_read(3, { 0xFE, 0xFF }, 0xfffe);
// UTF-32
e += test_read(4, { 0, 0, 0, 1 }, 1);
e += test_read(4, { 1, 0, 0, 0 }, FAIL);
e += test_write(4, 1, { 0, 0, 0, 1 });
e += test_write(4, 0x10203, { 0, 1, 2, 3 });
// UTF-32LE
e += test_read(6, { 0, 0, 0, 1 }, FAIL);
e += test_read(6, { 1, 0, 0, 0 }, 1);
// UTF-32 byte-order mark
e += test_read(4, { 0, 0, 0xFE, 0xFF, 0, 0, 0, 1 }, 1); // byte-order mark
e += test_read(4, { 0xFF, 0xFE, 0, 0, 1, 0, 0, 0 }, 1); // reversed byte-order mark
// disallowed byte-order marks
e += test_read(5, { 0, 0, 0xFE, 0xFF }, FAIL);
e += test_read(5, { 0xFF, 0xFE, 0, 0 }, FAIL);
e += test_read(6, { 0, 0, 0xFE, 0xFF }, FAIL);
e += test_read(6, { 0xFF, 0xFE, 0, 0 }, FAIL);
e += test_transcode(1, { 1, 2, 0xFE, 0xFF, 0, 0 }, // That's not a BOM; it's a zwnj when not the first char
1, { 1, 2, 0xFE, 0xFF, 0, 0 });
e += test_transcode(1, { 0xFF, 0xFE, 1, 2, 0, 0 }, // reversed byte-order mark implies little-endian
1, { 2, 1, 0, 0 });
e += test_transcode(4, { 0xFF, 0xFE, 0, 0, 1, 2, 0, 0, 0, 0 }, // reversed BOM means little-endian
4, { 0, 0, 2, 1, 0, 0, 0, 0 });
e += test_transcode(1, { 0xdb, 0xff, 0xdf, 0xff, 0, 0 }, // U+10ffff UTF-16 to UTF-8
0, { 0xf4, 0x8f, 191, 191, 0 });
return e;
}
''⎕R''⍠'InEnc' 'UTF16BE' 'OutEnc' 'UTF8-BOM'
.