Exemplo n.º 1
0
/**
 * Checks whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * Besides the structural checks (lead byte followed by the right number of
 * continuation bytes), the first continuation byte of a sequence is range
 * checked so that the following ill-formed inputs are rejected:
 *   - overlong encodings (C0/C1 leads, E0 80..9F, F0 80..8F),
 *   - UTF-16 surrogates U+D800..U+DFFF (ED A0..BF),
 *   - code points above U+10FFFF (F4 90.. and leads F5..FF).
 *
 * @param s  NUL-terminated string to validate; must not be null.
 * @return   true if every byte up to the terminator belongs to a well-formed
 *           UTF-8 sequence (the empty string is valid), false otherwise.
 */
bool isValidUTF8(const char* s) {
    int left = 0;  // continuation bytes still expected for the current codepoint
    // Inclusive range allowed for the NEXT continuation byte. It is narrowed
    // right after certain lead bytes and reset to the generic 80..BF afterwards.
    unsigned char lo = 0x80;
    unsigned char hi = 0xBF;

    while (*s) {
        const unsigned char c = static_cast<unsigned char>(*s++);

        if (left) {
            if (c < lo || c > hi)
                return false;  // not a (permitted) continuation byte
            lo = 0x80;
            hi = 0xBF;
            --left;
            continue;
        }

        if (c < 0x80)
            continue;  // ASCII byte
        if (c < 0xC2)
            return false;  // stray continuation byte (80..BF) or overlong lead (C0, C1)

        if (c < 0xE0) {
            left = 1;  // 2-byte sequence: U+0080..U+07FF
        } else if (c < 0xF0) {
            left = 2;  // 3-byte sequence: U+0800..U+FFFF
            if (c == 0xE0)
                lo = 0xA0;  // below A0 would encode < U+0800 (overlong)
            else if (c == 0xED)
                hi = 0x9F;  // above 9F would encode a surrogate
        } else if (c <= 0xF4) {
            left = 3;  // 4-byte sequence: U+10000..U+10FFFF
            if (c == 0xF0)
                lo = 0x90;  // below 90 would encode < U+10000 (overlong)
            else if (c == 0xF4)
                hi = 0x8F;  // above 8F would encode > U+10FFFF
        } else {
            return false;  // F5..FF can never appear in UTF-8
        }
    }

    // A non-zero count here means the string ended mid-codepoint.
    return left == 0;
}
Exemplo n.º 2
0
/**
 * Checks whether the remaining contents of a file stream are well-formed UTF-8.
 *
 * The stream is read byte by byte through its stream buffer until end of file
 * or until the first ill-formed sequence is found. Rejected inputs are:
 *   - a stream that is not in a good state on entry,
 *   - stray continuation bytes and lead bytes announcing more than 4 bytes,
 *   - truncated sequences or sequences with a non-continuation byte inside,
 *   - overlong encodings (value representable with a shorter sequence),
 *   - UTF-16 surrogates (U+D800..U+DFFF),
 *   - values above U+10FFFF (RFC 3629 limit).
 *
 * @param file_  stream to validate; it is consumed by this call.
 * @return       true if all bytes read form well-formed UTF-8 (an empty
 *               stream is valid), false otherwise.
 */
bool utf8_validator::validate(std::ifstream &file_)
{
    if (!file_.good())
    {
        return false;
    }

    auto it = std::istreambuf_iterator<char>(file_);
    const auto END = std::istreambuf_iterator<char>();

    // Smallest code point that genuinely needs a sequence of the indexed
    // length; anything smaller is an overlong encoding.
    static const unsigned int MIN_CODE_POINT[] = {0, 0, 0x80, 0x800, 0x10000};

    while (it != END)
    {
        const unsigned char c = static_cast<unsigned char>(*it);
        ++it; // always consume the lead byte so the next read sees a fresh byte

        if (c <= 0x7f)
        { // ASCII character
            continue;
        }

        // NOTE(review): assumes leadingOnes() counts the leading 1 bits of an
        // 8-bit value, as the original code did.
        const int numOnes = leadingOnes(c);
        if (numOnes < 2 || numOnes > 4)
        { // stray continuation byte or invalid number of leading ones
            return false;
        }

        // payload bits of the lead byte
        unsigned int utfChar = c & (0xffu >> numOnes);

        // following bytes must start with bits 10 and contribute 6 bits each
        for (int i = 1; i < numOnes; ++i)
        {
            if (it == END)
            { // stream ended in the middle of a sequence
                return false;
            }

            const unsigned char next = static_cast<unsigned char>(*it);
            if ((next & 0xc0) != 0x80)
            {
                return false;
            }

            utfChar = (utfChar << 6) | (next & 0x3f);
            ++it;
        }

        if (utfChar < MIN_CODE_POINT[numOnes])
        { // unnecessarily long sequence of bytes (e.g. 11000000 10000001)
            return false;
        }

        if (utfChar >= 0xD800 && utfChar <= 0xDFFF)
        { // surrogates are reserved for UTF-16 and not valid in UTF-8
            return false;
        }

        if (utfChar > 0x10FFFF)
        { // too large number according to RFC 3629
            return false;
        }
    }

    return true;
}
Exemplo n.º 3
0
/**
 * ORs the encoding of an immediate "mask" operand into an instruction word.
 *
 * Depending on the opcode, lmask is either a small immediate that is placed
 * verbatim into an opcode-specific field, or a genuine bit mask that is
 * converted to begin/end bit numbers.
 *
 * @param instruction  instruction word to update in place (bits are OR-ed in)
 * @param op           mnemonic selecting which field layout to use
 * @param lmask        immediate value or bit mask to encode
 */
static void insertMaskField(uint32_t *instruction, TR::InstOpCode::Mnemonic op, int64_t lmask)
   {
   int32_t encoding;
   // A mask is a string of 1 bits surrounded by a string of 0 bits.
   // For word instructions it is specified through its start and stop bit
   // numbers.  Note - the mask is considered circular so the start bit
   // number may be greater than the stop bit number.
   // Examples:     input     start   stop
   //              00FFFF00      8     23
   //              00000001     31     31
   //              80000001     31      0
   //              FFFFFFFF      0     31  (somewhat arbitrary)
   //              00000000      ?      ?  (illegal)
   //
   // For doubleword instructions only one of the start bit or stop bit is
   // specified and the other is implicit in the instruction.  The bit
   // number is strangely encoded in that the low order bit 5 comes first
   // and the high order bits after.  The field is in bit positions 21-26.

   // For these instructions the immediate is not a mask but a 1-bit immediate operand
   if (op == TR::InstOpCode::cmprb)
      {
      // populate 1-bit L field
      encoding = (static_cast<uint32_t>(lmask) & 0x1) << 21;
      *instruction |= encoding;
      return;
      }

   // For these instructions the immediate is not a mask but a 2-bit immediate operand
   if (op == TR::InstOpCode::xxpermdi ||
       op == TR::InstOpCode::xxsldwi)
      {
      encoding = (static_cast<uint32_t>(lmask) & 0x3) << 8;
      *instruction |= encoding;
      return;
      }

   // Same 2-bit immediate, but located one bit further left
   if (op == TR::InstOpCode::addex ||
       op == TR::InstOpCode::addex_r)
      {
      encoding = (static_cast<uint32_t>(lmask) & 0x3) << 9;
      *instruction |= encoding;
      return;
      }

   // For these instructions the immediate is not a mask but a 4-bit immediate operand
   if (op == TR::InstOpCode::vsldoi)
      {
      encoding = (static_cast<uint32_t>(lmask) & 0xf) << 6;
      *instruction |= encoding;
      return;
      }

   TR::InstOpCode       opCode(op);

   // CR logical instructions take the low 32 bits of lmask unchanged
   if (opCode.isCRLogical())
      {
      encoding = static_cast<uint32_t>(lmask);
      *instruction |= encoding;
      return;
      }

   // NOTE(review): this assert is also reached for drrnd/dqua below, where
   // lmask is an immediate rather than a mask - confirm that a value of 0
   // is never passed for those opcodes.
   TR_ASSERT(lmask, "A mask of 0 cannot be encoded");

   if (opCode.isDoubleWord())
      {
      int bitnum;

      if (opCode.useMaskEnd())
         {
         // An ME-style mask must be a contiguous run of ones starting at the
         // most significant bit; the encoded value is the last one bit.
         TR_ASSERT(contiguousBits(lmask) &&
                ((lmask & CONSTANT64(0x8000000000000000)) != 0) &&
                ((lmask == -1) || ((lmask & 0x1) == 0)),
                "Bad doubleword mask for ME encoding");
         bitnum = leadingOnes(lmask) - 1;
         }
      else
         {
         // MB-style mask: the encoded value is the first one bit.
         bitnum = leadingZeroes(lmask);
         // assert on cases like 0xffffff00000000ff
         TR_ASSERT((bitnum != 0) || (lmask == -1) || ((lmask & 0x1) == 0) ||
                             (op!=TR::InstOpCode::rldic   &&
                              op!=TR::InstOpCode::rldimi  &&
                              op!=TR::InstOpCode::rldic_r &&
                              op!=TR::InstOpCode::rldimi_r),
                "Cannot handle wrap-around, check mask for correctness");
         }
      // Split 6-bit field: low five bits of bitnum are shifted left by 6,
      // bit 5 of bitnum stays at bit position 5.
      encoding = ((bitnum & 0x1f) << 6) | (bitnum & 0x20);
      }
   else // single word
      {
      // special case the rounding mode fields (two bits, selected by mask 0x600)
      if (op == TR::InstOpCode::drrnd || op == TR::InstOpCode::dqua)
         {
         // Mask before shifting so that no signed 64-bit value is shifted left;
         // the result is identical to (lmask << 9) & 0x600.
         encoding = (static_cast<uint32_t>(lmask) & 0x3) << 9;
         }
      else
         {
         const int32_t mask = static_cast<int32_t>(lmask & 0xffffffff);
         // mask shifted left by one, done on the unsigned representation to
         // avoid signed overflow when bit 30 or 31 of the mask is set
         const int32_t shifted = static_cast<int32_t>(static_cast<uint32_t>(mask) << 1);

         // ~mask & shifted marks the zero bit directly above a run of ones;
         // the mask begins one bit after it.  A result of 32 (no such bit)
         // wraps to a begin position of 0.
         int32_t maskBegin = leadingZeroes(~mask & shifted);
         maskBegin = (maskBegin + (maskBegin != 32)) & 0x1f;

         // mask & ~shifted marks the lowest one bit of a run, i.e. the mask end
         const int32_t maskEnd = leadingZeroes(mask & ~shifted);

         // begin lands at a shift of 6 and end at a shift of 1 in the instruction
         encoding = ((32 * maskBegin) + maskEnd) << 1; // shift encoded mask into position
         }
      }
   *instruction |= encoding;
   }