// We only use U8_* macros, which are entirely inline. #include"unicode/utf8.h"
// This contains a codepage and ISO 14882:1998 illegality table. // Use "make gen-table" to rebuild it. #include"cptbl.h"
/**
 * What is this?
 *
 * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
 * in utf-8 into something consumable by certain compilers (Solaris, xlC)
 * which aren't quite standards compliant.
 *
 * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
 * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
 *   (some compilers do not support the u8 prefix correctly.)
 * - if the system is EBCDIC-based, that is used to correct the input characters.
 *
 * Usage:
 *   escapesrc infile.cpp outfile.cpp
 * Normally this is invoked by the build stage, with a rule such as:
 *
 * _%.cpp: $(srcdir)/%.cpp
 *       @$(BINDIR)/escapesrc$(EXEEXT) $< $@
 * %.o: _%.cpp
 *       $(COMPILE.cc) ... $@ $<
 *
 * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp
 * from being itself escaped.
 */
// For convenience # define cp1047_to_8859(c) cp1047_8859_1[c]
// Our app's name
std::string prog;
/** * Give the usual 1-line documentation and exit
*/ void usage() {
fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
}
/** * Delete the output file (if any) * We want to delete even if we didn't generate, because it might be stale.
*/ int cleanup(const std::string &outfile) { constchar *outstr = outfile.c_str(); if(outstr && *outstr) { int rc = std::remove(outstr); if(rc == 0) {
fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); return 0;
} else { if( errno == ENOENT ) { return 0; // File did not exist - no error.
} else {
perror("std::remove"); return 1;
}
}
} return 0;
}
/** * Skip across any known whitespace. * @param p startpoint * @param e limit * @return first non-whitespace char
*/ inlineconstchar *skipws(constchar *p, constchar *e) { for(;p<e;p++) { switch(*p) { case kSPACE: case kTAB: case kLF: case kCR: break; default: return p; // non ws
}
} return p;
}
/** * Append a byte, hex encoded * @param outstr sstring to append to * @param byte the byte to append
*/ void appendByte(std::string &outstr,
uint8_t byte) { char tmp2[5];
snprintf(tmp2, sizeof(tmp2), "\\x%02X", 0xFF & static_cast<int>(byte));
outstr += tmp2;
}
/** * Append the bytes from 'linestr' into outstr, with escaping * @param outstr the output buffer * @param linestr the input buffer * @param pos in/out: the current char under consideration * @param chars the number of chars to consider * @return true on failure
*/ bool appendUtf8(std::string &outstr, const std::string &linestr,
size_t &pos,
size_t chars) { char tmp[9]; for(size_t i=0;i<chars;i++) {
tmp[i] = linestr[++pos];
}
tmp[chars] = 0; unsignedint c;
sscanf(tmp, "%X", &c);
UChar32 ch = c & 0x1FFFFF;
// now to append \\x%% etc
uint8_t bytesNeeded = U8_LENGTH(ch); if(bytesNeeded == 0) {
fprintf(stderr, "Illegal code point U+%X\n", ch); returntrue;
}
uint8_t bytes[4];
uint8_t *s = bytes;
size_t i = 0;
U8_APPEND_UNSAFE(s, i, ch); for(size_t t = 0; t<i; t++) {
appendByte(outstr, s[t]);
} returnfalse;
}
/** * fix the u"x"/u'x'/u8"x" string at the position * u8'x' is not supported, sorry. * @param linestr the input string * @param pos the position * @return false = no err, true = had err
*/ bool fixAt(std::string &linestr, size_t pos) {
size_t origpos = pos;
if(linestr[pos] != 'u') {
fprintf(stderr, "Not a 'u'?"); returntrue;
}
for(; pos < linestr.size(); pos++) { if(linestr[pos] == quote) { if(utf8) { return fixu8(linestr, origpos, pos); // fix u8"..."
} else { returnfalse; // end of quote
}
} if(linestr[pos] == '\\') {
pos++; if(linestr[pos] == quote) continue; // quoted quote if(linestr[pos] == 'u') continue; // for now ... unicode escape if(linestr[pos] == '\\') continue; // some other escape… ignore
} else {
size_t old_pos = pos;
int32_t i = pos; #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) // mogrify 1-4 bytes from 1047 'back' to utf-8 char old_byte = linestr[pos];
linestr[pos] = cp1047_to_8859(linestr[pos]); // how many more?
int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
linestr[pos2] = cp1047_to_8859(linestr[pos2]); if(linestr[pos2] == 0x0A) {
linestr[pos2] = 0x85; // NL is ambiguous here
}
} #endif
// Proceed to decode utf-8 const uint8_t* s = reinterpret_cast<const uint8_t*>(linestr.c_str());
int32_t length = linestr.size();
UChar32 c; if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
linestr[pos] = old_byte; // put it back #endif continue; // single code point not previously legal for \u escaping
}
// otherwise, convert it to \u / \U
{
U8_NEXT(s, i, length, c);
} if(c<0) {
fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", static_cast<int>(old_pos));
fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); returntrue;
}
/** * Fixup an entire line * false = no err * true = had err * @param no the line number (not used) * @param linestr the string to fix * @return true if any err, else false
*/ bool fixLine(int/*no*/, std::string &linestr) { constchar *line = linestr.c_str();
size_t len = linestr.size();
// no u' in the line? if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { returnfalse; // Nothing to do. No u' or u" detected
}
// start from the end and find all u" cases
size_t pos = len = linestr.size(); if(len>INT32_MAX/2) { returntrue;
} while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { //printf("found doublequote at %d\n", pos); if(fixAt(linestr, pos)) returntrue; if(pos == 0) break;
pos--;
}
// reset and find all u' cases
pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { //printf("found singlequote at %d\n", pos); if(fixAt(linestr, pos)) returntrue; if(pos == 0) break;
pos--;
}
// reset and find all u8" cases
pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { if(fixAt(linestr, pos)) returntrue; if(pos == 0) break;
pos--;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.