如何将字符串转换或解码为VB6中可读的格式?

问题描述 投票:0回答:1

下面的字符串是请求json文件的结果。

StrResult ="\u00D8\u00B3\u00D9\u0084\u00D8\u00A7\u00D9\u0085 
\u00D8\u00AF\u00D9\u0086\u00DB\u008C\u00D8\u00A7"

如何将这个字符串转换为可读字符? 有很多其他语言的示例代码,包括Python、.Net等,但我找不到VB6的任何代码。如果您知道的话,谢谢您。这对我来说至关重要。

提示:解码后应接收到的字符串是“มรรรร”,其对应的英文是“Hello World”。

json encoding vb6 escaping
1个回答
0
投票

您提供的字符串不会解码为“?????”,而是解码为“سÙا٠Ø̄ÙÛا”。您可以在此处确认。

您的字符串实际包含的是单独的 UTF-8 字节,而不是 Unicode 代码点。这使您的任务变得更加困难,因为 VB6 字符串通常在内存中以 UTF-16 编码。

我最近开发了一个库,其中包含用于 VBA 的广泛字符串功能,但我认为有关转义和取消转义 Unicode 文字的部分应该可以像 VB6 代码一样工作。您可以在 GitHub here 上找到整个库,但我可以在此处直接包含应该解决您问题的部分。

使用下面我提供的库中的函数,您应该能够实现您想要的结果,如下所示:

StrResult = DecodeUTF8(EncodeANSI(UnescapeUnicode(StrResult)))

这些是所需的功能:

Option Explicit

Public Enum UnicodeEscapeFormat
    [_efNone] = 0
    efPython = 1 '\uXXXX \u00XXXXXX (4 or 8 hex digits, 8 for chars outside BMP)
    efRust = 2   '\u{X} \U{XXXXXX}  (1 to 6 hex digits)
    efUPlus = 4  'u+XXXX u+XXXXXX   (4 or 6 hex digits)
    efMarkup = 8 '&#ddddddd;        (1 to 7 decimal digits)
    efAll = 15
    [_efMin] = efPython
    [_efMax] = efAll
End Enum

Private Type EscapeSequence
    ueFormat As UnicodeEscapeFormat
    ueSignature As String
    letSngSurrogate As Boolean
    buffPosition As Long
    currPosition As Long
    sigSize As Long
    escSize As Long
    codepoint As Long
    unEscSize As Long
End Type
Private Type TwoCharTemplate
    s As String * 2
End Type
Private Type LongTemplate
    l As Long
End Type

'Replaces all occurences of unicode characters outside the codePoint range
'defined by maxNonEscapedCharCode with literals of the following formats
'specified by `escapeFormat`:
' efPython = 1 ... \uXXXX \u00XXXXXX   (4 or 8 hex digits, 8 for chars outside BMP)
' efRust   = 2 ... \u{XXXX} \U{XXXXXX} (1 to 6 hex digits)
' efUPlus  = 4 ... u+XXXX u+XXXXXX     (4 or 6 hex digits)
' efMarkup = 8 ... &#ddddddd;          (1 to 7 decimal digits)
'Where:
'   - prefixes \u is case insensitive
'   - Xes are the digits of the codepoint in hexadecimal. (X = 0-9 or A-F/a-f)
'Note:
'   - Avoid u+XXXX syntax if string contains literals without delimiters as it
'     can be misinterpreted if adjacent to text starting with 0-9 or a-f.
'   - This function accepts all combinations of UnicodeEscapeFormats:
'     If called with, e.g. `escapeFormat = efRust Or efPython`, every character
'     in the scope will be escaped with in either format, efRust or efPython,
'     chosen at random for each replacement.
'   - If `escapeFormat` is set to efAll, it will replace every character in the
'     scope with a randomly chosen format of all available fotrmats.
'   - To escape every character, set `maxNonEscapedCharCode = -1`
Public Function EscapeUnicode(ByRef str As String, _
                     Optional ByVal maxNonEscapedCharCode As Long = &HFF, _
                     Optional ByVal escapeFormat As UnicodeEscapeFormat _
                                                = efPython) As String
    Const methodName As String = "EscapeUnicode"
    If maxNonEscapedCharCode < -1 Then Err.Raise 5, methodName, _
        "`maxNonEscapedCharCode` must be greater or equal -1."
    If escapeFormat < [_efMin] Or escapeFormat > [_efMax] Then _
        Err.Raise 5, methodName, "Invalid escape type."
    If Len(str) = 0 Then Exit Function
    Dim i As Long
    Dim j As Long:                j = 1
    Dim result() As String:       ReDim result(1 To Len(str))
    Dim copyChunkSize As Long
    Dim rndEscapeFormat As Boolean
    rndEscapeFormat = ((escapeFormat And (escapeFormat - 1)) <> 0) 'eFmt <> 2^n
    Dim numescapeFormats As Long
    If rndEscapeFormat Then
        Dim escapeFormats() As Long
        For i = 0 To (Log(efAll + 1) / Log(2)) - 1
            If 2 ^ i And escapeFormat Then
                ReDim Preserve escapeFormats(0 To numescapeFormats)
                escapeFormats(numescapeFormats) = 2 ^ i
                numescapeFormats = numescapeFormats + 1
            End If
        Next i
    End If
    For i = 1 To Len(str)
        Dim codepoint As Long: codepoint = AscU(Mid$(str, i, 2))
        If codepoint > maxNonEscapedCharCode Then
            If copyChunkSize > 0 Then
                result(j) = Mid$(str, i - copyChunkSize, copyChunkSize)
                copyChunkSize = 0
                j = j + 1
            End If
            If rndEscapeFormat Then
                escapeFormat = escapeFormats(Int(numescapeFormats * Rnd))
            End If
            Select Case escapeFormat
                Case efPython
                    If codepoint > &HFFFF& Then 'Outside BMP
                        result(j) = "\u" & "00" & Right$("0" & Hex(codepoint), 6)
                    Else 'BMP
                        result(j) = "\u" & Right$("000" & Hex(codepoint), 4)
                    End If
                Case efRust
                    result(j) = "\u{" & Hex(codepoint) & "}"
                Case efUPlus
                    If codepoint < &H1000& Then
                        result(j) = "u+" & Right$("000" & Hex(codepoint), 4)
                    Else
                        result(j) = "u+" & Hex(codepoint)
                    End If
                Case efMarkup
                    result(j) = "&#" & codepoint & ";"
            End Select
            If rndEscapeFormat Then
                If Int(2 * Rnd) = 1 Then result(j) = UCase(result(j))
            End If
            j = j + 1
        Else
            If codepoint < &H10000 Then
                copyChunkSize = copyChunkSize + 1
            Else
                copyChunkSize = copyChunkSize + 2
            End If
        End If
        If codepoint > &HFFFF& Then i = i + 1
    Next i
    If copyChunkSize > 0 Then _
        result(j) = Mid$(str, i - copyChunkSize, copyChunkSize)
    EscapeUnicode = Join(result, "")
End Function

'Replaces all occurences of unicode literals
'Accepts the following formattings `escapeFormat`:
'   efPython = 1 ... \uXXXX \u000XXXXX    (4 or 8 hex digits, 8 for chars outside BMP)
'   efRust   = 2 ... \u{XXXX} \U{XXXXXXX} (1 to 6 hex digits)
'   efUPlus  = 4 ... u+XXXX u+XXXXXX      (4 or 6 hex digits)
'   efMarkup = 8 ... &#ddddddd;           (1 to 7 decimal digits)
'Where:
'   - prefixes \u is case insensitive
'   - Xes are the digits of the codepoint in hexadecimal. (X = 0-9 or A-F/a-f)
'Example:
'   - "abcd &#97;u+0062\U0063xy\u{64}", efAll returns "abcd abcxyd"
'Notes:
'   - Avoid u+XXXX syntax if string contains literals without delimiters as it
'     can be misinterpreted if adjacent to text starting with 0-9 or a-f.
'   - This function also accepts all combinations of UnicodeEscapeFormats:
'       E.g.:
'UnescapeUnicode("abcd &#97;u+0062\U0063xy\u{64}", efMarkup Or efRust)
'       will return:
'"abcd au+0062\U0063xyd"
'   - By default, this function will not invalidate UTF-16 strings if they are
'     currently valid, but this can happen if `allowSingleSurrogates = True`
'     E.g.: EscapeUnicode(ChrU(&HD801&, True)) returns "\uD801", but this string
'     can no longer be un-escaped with UnescapeUnicode because "\uD801"
'     represents a surrogate halve which is invalid unicode on its own.
'     So UnescapeUnicode("\uD801") returns "\uD801" again, unless called with
'     the optional parameter `allowSingleSurrogates = False` like this
'     `UnescapeUnicode("\uD801", , True)`. This will return invalid UTF-16.
Public Function UnescapeUnicode(ByRef str As String, _
                       Optional ByVal escapeFormat As UnicodeEscapeFormat = efAll, _
                       Optional ByVal allowSingleSurrogates As Boolean = False) _
                                As String
    If escapeFormat < [_efMin] Or escapeFormat > [_efMax] Then
        Err.Raise 5, "EscapeUnicode", "Invalid escape format"
    End If

    Dim escapes() As EscapeSequence: escapes = NewEscapes()
    Dim lb As Long: lb = LBound(escapes)
    Dim ub As Long: ub = UBound(escapes)
    Dim i As Long

    For i = lb To ub 'Find first signature for each wanted format
        With escapes(i)
            If escapeFormat And .ueFormat Then
                .buffPosition = InStr(1, str, .ueSignature, vbBinaryCompare)
                .letSngSurrogate = allowSingleSurrogates
            End If
        End With
    Next i
    UnescapeUnicode = str 'Allocate buffer

    Const posByte As Byte = &H80
    Const buffSize As Long = 1024
    Dim buffSignaturePos(1 To buffSize) As Byte
    Dim buffFormat(1 To buffSize) As UnicodeEscapeFormat
    Dim buffEscIndex(1 To buffSize) As Long
    Dim posOffset As Long
    Dim diff As Long
    Dim highSur As Long
    Dim lowSur As Long
    Dim remainingLen As Long: remainingLen = Len(str)
    Dim posChar As String:    posChar = ChrB$(posByte)
    Dim outPos As Long:       outPos = 1
    Dim inPos As Long:        inPos = 1

    Do
        Dim upperLimit As Long: upperLimit = posOffset + buffSize
        For i = lb To ub 'Find all signatures within buffer size
            With escapes(i)
                Do Until .buffPosition = 0 Or .buffPosition > upperLimit
                    .buffPosition = .buffPosition - posOffset
                    buffSignaturePos(.buffPosition) = posByte
                    buffFormat(.buffPosition) = .ueFormat
                    buffEscIndex(.buffPosition) = i
                    .buffPosition = .buffPosition + .sigSize + posOffset
                    .buffPosition = InStr(.buffPosition, str, .ueSignature)
                Loop
            End With
        Next i

        Dim temp As String:  temp = buffSignaturePos
        Dim nextPos As Long: nextPos = InStrB(1, temp, posChar)

        Do Until nextPos = 0 'Unescape all found signatures from buffer
            i = buffEscIndex(nextPos)
            escapes(i).currPosition = nextPos + posOffset
            Select Case buffFormat(nextPos)
                Case efPython: TryPythonEscape escapes(i), str
                Case efRust:   TryRustEscape escapes(i), str
                Case efUPlus:  TryUPlusEscape escapes(i), str
                Case efMarkup: TryMarkupEscape escapes(i), str
            End Select
            With escapes(i)
                If .unEscSize > 0 Then
                    diff = .currPosition - inPos
                    If outPos > 1 Then
                        Mid$(UnescapeUnicode, outPos) = Mid$(str, inPos, diff)
                    End If
                    outPos = outPos + diff
                    If .unEscSize = 1 Then
                        Mid$(UnescapeUnicode, outPos) = ChrW$(.codepoint)
                    Else
                        .codepoint = .codepoint - &H10000
                        highSur = &HD800& Or (.codepoint \ &H400&)
                        lowSur = &HDC00& Or (.codepoint And &H3FF&)
                        Mid$(UnescapeUnicode, outPos) = ChrW$(highSur)
                        Mid$(UnescapeUnicode, outPos + 1) = ChrW$(lowSur)
                    End If
                    outPos = outPos + .unEscSize
                    inPos = .currPosition + .escSize
                    nextPos = nextPos + .escSize - .sigSize
                End If
                nextPos = InStrB(nextPos + .sigSize, temp, posChar)
            End With
        Loop
        remainingLen = remainingLen - buffSize
        posOffset = posOffset + buffSize
        Erase buffSignaturePos
    Loop Until remainingLen < 1

    If outPos > 1 Then
        diff = Len(str) - inPos + 1
        If diff > 0 Then
            Mid$(UnescapeUnicode, outPos, diff) = Mid$(str, inPos, diff)
        End If
        UnescapeUnicode = Left$(UnescapeUnicode, outPos + diff - 1)
    End If
End Function
Private Function NewEscapes() As EscapeSequence()
    Static escapes(0 To 6) As EscapeSequence
    If escapes(0).ueFormat = [_efNone] Then
        InitEscape escapes(0), efPython, "\U"
        InitEscape escapes(1), efPython, "\u"
        InitEscape escapes(2), efRust, "\U{"
        InitEscape escapes(3), efRust, "\u{"
        InitEscape escapes(4), efUPlus, "U+"
        InitEscape escapes(5), efUPlus, "u+"
        InitEscape escapes(6), efMarkup, "&#"
    End If
    NewEscapes = escapes
End Function
Private Sub InitEscape(ByRef escape As EscapeSequence, _
                       ByVal ueFormat As UnicodeEscapeFormat, _
                       ByRef ueSignature As String)
    With escape
        .ueFormat = ueFormat
        .ueSignature = ueSignature
        .sigSize = Len(ueSignature)
    End With
End Sub

Private Sub TryPythonEscape(ByRef escape As EscapeSequence, ByRef str As String)
    Const H As String = "[0-9A-Fa-f]"
    Const PYTHON_ESCAPE_PATTERN_NOT_BMP = "00[01]" & H & H & H & H & H
    Const PYTHON_ESCAPE_PATTERN_BMP As String = H & H & H & H & "*"
    Dim potentialEscape As String

    With escape
        .unEscSize = 0
        potentialEscape = Mid$(str, .currPosition + 2, 8) 'Exclude leading \[Uu]
        If potentialEscape Like PYTHON_ESCAPE_PATTERN_NOT_BMP Then
            .escSize = 10 '\[Uu]00[01]HHHHH
            .codepoint = CLng("&H" & potentialEscape) 'No extra Mid$ needed
            If .codepoint < &H10000 Then
                If IsValidBMP(.codepoint, .letSngSurrogate) Then
                    .unEscSize = 1
                    Exit Sub
                End If
            ElseIf .codepoint < &H110000 Then
                .unEscSize = 2
                Exit Sub
            End If
        End If
        If potentialEscape Like PYTHON_ESCAPE_PATTERN_BMP Then
            .escSize = 6 '\[Uu]HHHH
            .codepoint = CLng("&H" & Left$(potentialEscape, 4))
            If IsValidBMP(.codepoint, .letSngSurrogate) Then .unEscSize = 1
        End If
    End With
End Sub
Private Function IsValidBMP(ByVal codepoint As Long, _
                            ByVal letSingleSurrogate As Boolean) As Boolean
    IsValidBMP = (codepoint < &HD800& Or codepoint >= &HE000& Or letSingleSurrogate)
End Function

Private Sub TryRustEscape(ByRef escape As EscapeSequence, ByRef str As String)
    Static rustEscPattern(1 To 6) As String
    Static isPatternInit As Boolean
    Dim potentialEscape As String
    Dim nextBrace As Long

    If Not isPatternInit Then
        Dim i As Long
        rustEscPattern(1) = "[0-9A-Fa-f]}*"
        For i = 2 To 6
            rustEscPattern(i) = "[0-9A-Fa-f]" & rustEscPattern(i - 1)
        Next i
        isPatternInit = True
    End If
    With escape
        .unEscSize = 0
        potentialEscape = Mid$(str, .currPosition + 3, 7) 'Exclude leading \[Uu]{
        nextBrace = InStr(2, potentialEscape, "}", vbBinaryCompare)

        If nextBrace = 0 Then Exit Sub
        If Not potentialEscape Like rustEscPattern(nextBrace - 1) Then Exit Sub

        .codepoint = CLng("&H" & Left$(potentialEscape, nextBrace - 1))
        .escSize = nextBrace + 3
        If .codepoint < &H10000 Then
            If IsValidBMP(.codepoint, .letSngSurrogate) Then .unEscSize = 1
        ElseIf .codepoint < &H110000 Then
            .unEscSize = 2
        End If
    End With
End Sub

Private Sub TryUPlusEscape(ByRef escape As EscapeSequence, _
                           ByRef str As String)
    Const H As String = "[0-9A-Fa-f]"
    Const UPLUS_ESCAPE_PATTERN_4_DIGITS = H & H & H & H & "*"
    Const UPLUS_ESCAPE_PATTERN_5_DIGITS = H & H & H & H & H & "*"
    Const UPLUS_ESCAPE_PATTERN_6_DIGITS = H & H & H & H & H & H
    Dim potentialEscape As String

    With escape
        .unEscSize = 0
        potentialEscape = Mid$(str, .currPosition + 2, 6) 'Exclude leading [Uu]+
        If potentialEscape Like UPLUS_ESCAPE_PATTERN_6_DIGITS Then
            .escSize = 8
            .codepoint = CLng("&H" & potentialEscape)
            If .codepoint < &H10000 Then
                If IsValidBMP(.codepoint, .letSngSurrogate) Then
                    .unEscSize = 1
                    Exit Sub
                End If
            ElseIf .codepoint < &H110000 Then
                .unEscSize = 2
                Exit Sub
            End If
        End If
        If potentialEscape Like UPLUS_ESCAPE_PATTERN_5_DIGITS Then
            .escSize = 7
            .codepoint = CLng("&H" & Left$(potentialEscape, 5))
            If .codepoint < &H10000 Then
                If IsValidBMP(.codepoint, .letSngSurrogate) Then
                    .unEscSize = 1
                    Exit Sub
                End If
            Else
                .unEscSize = 2
                Exit Sub
            End If
        End If
        If potentialEscape Like UPLUS_ESCAPE_PATTERN_4_DIGITS Then
            .escSize = 6
            .codepoint = CLng("&H" & Left$(potentialEscape, 4))
            If IsValidBMP(.codepoint, .letSngSurrogate) Then .unEscSize = 1
        End If
    End With
End Sub
Private Sub TryMarkupEscape(ByRef escape As EscapeSequence, _
                            ByRef str As String)
    Static mEscPattern(1 To 7) As String
    Static isPatternInit As Boolean
    Dim potentialEscape As String
    Dim nextSemicolon As Long

    If Not isPatternInit Then
        Dim i As Long
        For i = 1 To 6
            mEscPattern(i) = String$(i, "#") & ";*"
        Next i
        mEscPattern(7) = "1######;"
        isPatternInit = True
    End If
    With escape
        .unEscSize = 0
        potentialEscape = Mid$(str, .currPosition + 2, 8) 'Exclude leading &[#]
        nextSemicolon = InStr(2, potentialEscape, ";", vbBinaryCompare)

        If nextSemicolon = 0 Then Exit Sub
        If Not potentialEscape Like mEscPattern(nextSemicolon - 1) Then Exit Sub

        .codepoint = CLng(Left$(potentialEscape, nextSemicolon - 1))
        .escSize = nextSemicolon + 2
        If .codepoint < &H10000 Then
            If IsValidBMP(.codepoint, .letSngSurrogate) Then .unEscSize = 1
        ElseIf .codepoint < &H110000 Then
            .unEscSize = 2
        End If
    End With
End Sub

'Returns the given unicode codepoint as standard VBA UTF-16LE string
Public Function ChrU(ByVal codepoint As Long, _
             Optional ByVal allowSingleSurrogates As Boolean = False) As String
    Const methodName As String = "ChrU"
    Static st As TwoCharTemplate
    Static lt As LongTemplate

    If codepoint < &H8000 Then Err.Raise 5, methodName, "Codepoint < -32768"
    If codepoint < 0 Then codepoint = codepoint And &HFFFF& 'Incase of uInt input

    If codepoint < &HD800& Then
        ChrU = ChrW$(codepoint)
    ElseIf codepoint < &HE000& And Not allowSingleSurrogates Then
        Err.Raise 5, methodName, "Range reserved for surrogate pairs"
    ElseIf codepoint < &H10000 Then
        ChrU = ChrW$(codepoint)
    ElseIf codepoint < &H110000 Then
        lt.l = (&HD800& Or (codepoint \ &H400& - &H40&)) _
            Or (&HDC00 Or (codepoint And &H3FF&)) * &H10000 '&HDC00 with no &
        LSet st = lt
        ChrU = st.s
    Else
        Err.Raise 5, methodName, "Codepoint outside of valid Unicode range."
    End If
End Function

'Returns a given characters unicode codepoint as long.
'Note: One unicode character can consist of two VBA "characters", a so-called
'      "surrogate pair" (input string of length 2, so Len(char) = 2!)
Public Function AscU(ByRef char As String) As Long
    AscU = AscW(char) And &HFFFF&
    If Len(char) > 1 Then
        Dim lo As Long: lo = AscW(Mid$(char, 2, 1)) And &HFFFF&
        If &HDC00& > lo Or lo > &HDFFF& Then Exit Function
        AscU = (AscU - &HD800&) * &H400& + (lo - &HDC00&) + &H10000
    End If
End Function

'Function transcoding a VBA-native UTF-16LE encoded string to an ANSI string
'Note: Information will be lost for codepoints > 255!
Public Function EncodeANSI(ByRef utf16leStr As String) As String
    Dim i As Long
    Dim j As Long:         j = 0
    Dim utf16le() As Byte: utf16le = utf16leStr
    Dim ansi() As Byte

    ReDim ansi(1 To Len(utf16leStr))
    For i = LBound(ansi) To UBound(ansi)
        If utf16le(j + 1) = 0 Then
            ansi(i) = utf16le(j)
            j = j + 2
        Else
            ansi(i) = &H3F 'Chr(&H3F) = "?"
            j = j + 2
        End If
    Next i
    EncodeANSI = ansi
End Function

'Function transcoding an UTF-8 encoded string to the VBA-native UTF-16LE
'Function transcoding an VBA-native UTF-16LE encoded string to UTF-8
Public Function DecodeUTF8(ByRef utf8Str As String, _
                  Optional ByVal raiseErrors As Boolean = False) As String

    Const methodName As String = "DecodeUTF8native"
    Dim i As Long
    Dim numBytesOfCodePoint As Byte

    Static numBytesOfCodePoints(0 To 255) As Byte
    Static mask(2 To 4) As Long
    Static minCp(2 To 4) As Long

    If numBytesOfCodePoints(0) = 0 Then
        For i = &H0& To &H7F&: numBytesOfCodePoints(i) = 1: Next i '0xxxxxxx
        '110xxxxx - C0 and C1 are invalid (overlong encoding)
        For i = &HC2& To &HDF&: numBytesOfCodePoints(i) = 2: Next i
        For i = &HE0& To &HEF&: numBytesOfCodePoints(i) = 3: Next i '1110xxxx
       '11110xxx - 11110100, 11110101+ (= &HF5+) outside of valid Unicode range
        For i = &HF0& To &HF4&: numBytesOfCodePoints(i) = 4: Next i
        For i = 2 To 4: mask(i) = (2 ^ (7 - i) - 1): Next i
        minCp(2) = &H80&: minCp(3) = &H800&: minCp(4) = &H10000
    End If

    Dim codepoint As Long
    Dim currByte As Byte
    Dim utf8() As Byte:  utf8 = utf8Str
    Dim utf16() As Byte: ReDim utf16(0 To (UBound(utf8) - LBound(utf8) + 1) * 2)
    Dim j As Long:       j = 0
    Dim k As Long

    i = LBound(utf8)
    Do While i <= UBound(utf8)
        codepoint = utf8(i)
        numBytesOfCodePoint = numBytesOfCodePoints(codepoint)

        If numBytesOfCodePoint = 0 Then
            If raiseErrors Then Err.Raise 5, methodName, "Invalid byte"
            GoTo insertErrChar
        ElseIf numBytesOfCodePoint = 1 Then
            utf16(j) = codepoint
            j = j + 2
        ElseIf i + numBytesOfCodePoint - 1 > UBound(utf8) Then
            If raiseErrors Then Err.Raise 5, methodName, _
                    "Incomplete UTF-8 codepoint at end of string."
            GoTo insertErrChar
        Else
            codepoint = utf8(i) And mask(numBytesOfCodePoint)

            For k = 1 To numBytesOfCodePoint - 1
                currByte = utf8(i + k)

                If (currByte And &HC0&) = &H80& Then
                    codepoint = (codepoint * &H40&) + (currByte And &H3F)
                Else
                    If raiseErrors Then _
                        Err.Raise 5, methodName, "Invalid continuation byte"
                    GoTo insertErrChar
                End If
            Next k
            'Convert the Unicode codepoint to UTF-16LE bytes
            If codepoint < minCp(numBytesOfCodePoint) Then
                If raiseErrors Then Err.Raise 5, methodName, "Overlong encoding"
                GoTo insertErrChar
            ElseIf codepoint < &HD800& Then
                utf16(j) = CByte(codepoint And &HFF&)
                utf16(j + 1) = CByte(codepoint \ &H100&)
                j = j + 2
            ElseIf codepoint < &HE000& Then
                If raiseErrors Then Err.Raise 5, methodName, _
                "Invalid Unicode codepoint.(Range reserved for surrogate pairs)"
                GoTo insertErrChar
            ElseIf codepoint < &H10000 Then
                If codepoint = &HFEFF& Then GoTo nextCp '(BOM - will be ignored)
                utf16(j) = codepoint And &HFF&
                utf16(j + 1) = codepoint \ &H100&
                j = j + 2
            ElseIf codepoint < &H110000 Then 'Calculate surrogate pair
                Dim m As Long:           m = codepoint - &H10000
                Dim loSurrogate As Long: loSurrogate = &HDC00& Or (m And &H3FF)
                Dim hiSurrogate As Long: hiSurrogate = &HD800& Or (m \ &H400&)

                utf16(j) = hiSurrogate And &HFF&
                utf16(j + 1) = hiSurrogate \ &H100&
                utf16(j + 2) = loSurrogate And &HFF&
                utf16(j + 3) = loSurrogate \ &H100&
                j = j + 4
            Else
                If raiseErrors Then Err.Raise 5, methodName, _
                        "Codepoint outside of valid Unicode range"
insertErrChar:  utf16(j) = &HFD
                utf16(j + 1) = &HFF
                j = j + 2

                If numBytesOfCodePoint = 0 Then numBytesOfCodePoint = 1
            End If
        End If
nextCp: i = i + numBytesOfCodePoint 'Move to the next UTF-8 codepoint
    Loop
    DecodeUTF8 = MidB$(utf16, 1, j)
End Function

注意:

EncodeANSI
函数可能会像这样被“滥用”,因为转义字符串中的 UTF-8 字节将始终被解码为单字节 UTF-16 字符,因为根据定义,它们是单字节。这意味着
EncodeANSI
函数仅用于删除字符串中的每隔一个字节(由于 UTF-16 表示单字节字符的方式,这些字节都为空。)生成的字符串是您所输入的字符串的 UTF-8 表示形式。然后我们“解码”(转换为 UTF-16),因为这是 vb6 表示 Unicode 字符串的本机方式。

我也包含了

EscapeUnicode
函数,因此您可以看到字符串作为转义的 unicode 代码点实际上应该是什么样子:

actualEscapeSequence = EscapeUnicode(DecodeUTF8(EncodeANSI(UnescapeUnicode(StrResult))))

actualEscapeSequence
将等于“\u0633\u0644\u0627\u0645 \u062F\u0646\u06CC\u0627”,您可以在here确认它是正确的unicode转义序列。

© www.soinside.com 2019 - 2024. All rights reserved.