/// Checks whether a NUL-terminated byte string is well-formed UTF-8.
///
/// A string is accepted only if every byte sequence in it is a shortest-form
/// encoding of a Unicode scalar value: overlong encodings, UTF-16 surrogates
/// (U+D800..U+DFFF), code points above U+10FFFF, stray continuation bytes and
/// sequences truncated by the terminating NUL are all rejected.
///
/// @param s  Pointer to a NUL-terminated string; must not be null.
/// @return   true if the whole string is valid UTF-8 (the empty string is valid).
bool isValidUTF8(const char* s) {
    int left = 0;               // continuation bytes still expected in the current codepoint
    unsigned char lo = 0x80;    // inclusive range allowed for the NEXT continuation byte;
    unsigned char hi = 0xBF;    // narrowed for the first continuation after some lead bytes

    while (*s) {
        const unsigned char c = static_cast<unsigned char>(*s++);

        if (left) {
            if (c < lo || c > hi) return false;  // not a (permitted) continuation byte
            // Only the first continuation byte is range-restricted.
            lo = 0x80;
            hi = 0xBF;
            --left;
            continue;
        }

        if (c < 0x80) continue;      // ASCII byte
        if (c < 0xC2) return false;  // 0x80..0xBF: unexpected continuation byte;
                                     // 0xC0/0xC1: would encode <= 0x7F in 2 bytes
        if (c > 0xF4) return false;  // lead byte could only encode values > 0x10FFFF

        if (c < 0xE0) {
            left = 1;                       // 2-byte sequence
        } else if (c < 0xF0) {
            left = 2;                       // 3-byte sequence
            if (c == 0xE0) lo = 0xA0;       // below that is an overlong encoding (< 0x800)
            else if (c == 0xED) hi = 0x9F;  // above that is a surrogate (0xD800..0xDFFF)
        } else {
            left = 3;                       // 4-byte sequence
            if (c == 0xF0) lo = 0x90;       // below that is an overlong encoding (< 0x10000)
            else if (c == 0xF4) hi = 0x8F;  // above that exceeds 0x10FFFF
        }
    }

    return left == 0;  // false if the string ended mid-codepoint
}
bool utf8_validator::validate(std::ifstream &file_) { if (!file_.good()) { return false; } auto it = std::istreambuf_iterator<char>(file_); const auto END = std::istreambuf_iterator<char>(); while (it != END) { unsigned char c = static_cast<unsigned char>(*it); if (c <= 0x7f) { // ASCII character ++it; continue; } int numOnes = leadingOnes(c); if (numOnes == 1 || numOnes > 4) { // invalid number of leading ones return false; } unsigned int utfChar = c & (0xff >> numOnes); if (utfChar == 0) { // unneccessarily long sequence of bytes (e.g. 11000000 00000001) return false; } // following bytes must start with bits 10 for (int i = 1; i < numOnes; ++i) { utfChar = (utfChar << 6) | (c & 0x3f); if (++it == END || (static_cast<unsigned char>(*it) & 0xc0) != 0x80) { return false; } } if (utfChar > 0x10FFFF) { // too large number according to RFC 3629 return false; } } return true; }
static void insertMaskField(uint32_t *instruction, TR::InstOpCode::Mnemonic op, int64_t lmask) { int32_t encoding; // A mask is is a string of 1 bits surrounded by a string of 0 bits. // For word instructions it is specified through its start and stop bit // numbers. Note - the mask is considered circular so the start bit // number may be greater than the stop bit number. // Examples: input start stop // 00FFFF00 8 23 // 00000001 31 31 // 80000001 31 0 // FFFFFFFF 0 31 (somewhat arbitrary) // 00000000 ? ? (illegal) // // For doubleword instructions only one of the start bit or stop bit is // specified and the other is implicit in the instruction. The bit // number is strangely encoded in that the low order bit 5 comes first // and the high order bits after. The field is in bit positions 21-26. // For these instructions the immediate is not a mask but a 1-bit immediate operand if (op == TR::InstOpCode::cmprb) { // populate 1-bit L field encoding = (((uint32_t)lmask) & 0x1) << 21; *instruction |= encoding; return; } // For these instructions the immediate is not a mask but a 2-bit immediate operand if (op == TR::InstOpCode::xxpermdi || op == TR::InstOpCode::xxsldwi) { encoding = (((uint32_t)lmask) & 0x3) << 8; *instruction |= encoding; return; } if (op == TR::InstOpCode::addex || op == TR::InstOpCode::addex_r) { encoding = (((uint32_t)lmask) & 0x3) << 9; *instruction |= encoding; return; } // For these instructions the immediate is not a mask but a 4-bit immediate operand if (op == TR::InstOpCode::vsldoi) { encoding = (((uint32_t)lmask) & 0xf)<< 6; *instruction |= encoding; return; } TR::InstOpCode opCode(op); if (opCode.isCRLogical()) { encoding = (((uint32_t) lmask) & 0xffffffff); *instruction |= encoding; return; } TR_ASSERT(lmask, "A mask of 0 cannot be encoded"); if (opCode.isDoubleWord()) { int bitnum; if (opCode.useMaskEnd()) { TR_ASSERT(contiguousBits(lmask) && ((lmask & CONSTANT64(0x8000000000000000)) != 0) && ((lmask == -1) || ((lmask & 0x1) == 0)), "Bad 
doubleword mask for ME encoding"); bitnum = leadingOnes(lmask) - 1; } else { bitnum = leadingZeroes(lmask); // assert on cases like 0xffffff00000000ff TR_ASSERT((bitnum != 0) || (lmask == -1) || ((lmask & 0x1) == 0) || (op!=TR::InstOpCode::rldic && op!=TR::InstOpCode::rldimi && op!=TR::InstOpCode::rldic_r && op!=TR::InstOpCode::rldimi_r), "Cannot handle wrap-around, check mask for correctness"); } encoding = ((bitnum&0x1f)<<6) | ((bitnum&0x20)); } else // single word { // special case the 3-bit rounding mode fields if (op == TR::InstOpCode::drrnd || op == TR::InstOpCode::dqua) { encoding = (lmask << 9) & 0x600; } else { int32_t mask = lmask&0xffffffff; int32_t maskBegin; int32_t maskEnd; maskBegin = leadingZeroes(~mask & (2*mask)); maskBegin = (maskBegin + (maskBegin != 32)) & 0x1f; maskEnd = leadingZeroes(mask & ~(2*mask)); encoding = 32*maskBegin + maskEnd << 1; // shift encrypted mask into position } } *instruction |= encoding; }