据我所知,Excel使用UTF-16来表示字符串文字。我从控制台 (Mac) / 文件 (Windows) 读取,在这两种情况下,字符编码都被弄乱了。我必须找到一个适用于两个平台的解决方案,所以 ADO 流不是一个选项。我做了一些调试,我看到实际字节是:
字节 |显示为 |应该是 |正确字节 258,129 | Ă | Á | 193 258,356 | ĂŤ | Í| 205 313,176 | ° | | 219 313,144 | Ĺ | Ő | 213 258,347 | ₛ | ü | 220 258,8211 | Ă– |哦 | 214 258,353 | Ăš | Ú | 218 258,8220 | Ă“ | Ó | 211 258,8240 | | É | 201
(来自古老的匈牙利语测试短语 ÁRVÍZTŰRŐ TÜKÖRFÚRÓGÉP,其中包含我们所有的特殊字符)。 我正在寻找一种在 Mac 和 Windows 上都能产生正确字符串的算法。 谢谢!
到目前为止发布的答案都无法正确转码包含完整 Unicode 范围内代码点的输入字符串,例如“😀👩u200d👩🦲👩👩u200d👩u200d👧u200d👦🦲u200d👩u200d👧u200d👦🦲u200d👧u200d👦 UnicodeSupport𐀀est😀u200d👩👩u200d👩u200d👧u200d👦💁🏼u200d♀️🧔🏻u200d♂️👩u200d❤️u200d👨🏃🏻u200d♀️”。
这就是我编写以下函数的原因,仅使用 Windows 和 MacOS 上都可用的 VBA 内置函数/语句。
此功能适用于跨平台和跨应用程序,适用于整个 Unicode 范围。
codePoints > 65535
也受支持,即使内置的 VBA ChrW()
和 AscW
不支持它们,因为转码完全是“手动”完成的,包括代理对。性能也应该相对较好,因为该函数适用于单个字节数组缓冲区。如果有人发现错误或改进,请告诉我!
由于this answer on CodeReview,此代码得到了改进,非常感谢Cristian Buse!
'Function transcoding an UTF-8 encoded string to the VBA-native UTF-16-LE
'Author: Guido Witt-Dörring, https://stackoverflow.com/a/75787820/12287457
' https://github.com/guwidoe/VBA-StringTools
Public Function DecodeUTF8(ByVal utf8str As String, _
Optional ByVal raiseErrors As Boolean = False) As String
Const methodName As String = "DecodeUTF8"
Dim i As Long, j As Long, k As Long, numBytesOfCodePoint As Byte
Static numBytesOfCodePoints(0 To 255) As Byte
Static mask(2 To 4) As Long, minCp(2 To 4) As Long
If numBytesOfCodePoints(0) = 0 Then
For i = &H0& To &H7F&: numBytesOfCodePoints(i) = 1: Next i '0xxxxxxx
'110xxxxx - C0 and C1 are invalid (overlong encoding)
For i = &HC2& To &HDF&: numBytesOfCodePoints(i) = 2: Next i
For i = &HE0& To &HEF&: numBytesOfCodePoints(i) = 3: Next i '1110xxxx
'11110xxx - 11110100, 11110101+ (= &HF5+) outside of valid Unicode range
For i = &HF0& To &HF4&: numBytesOfCodePoints(i) = 4: Next i
For i = 2 To 4: mask(i) = (2 ^ (7 - i) - 1): Next i
minCp(2) = &H80&: minCp(3) = &H800&: minCp(4) = &H10000
End If
Dim utf8() As Byte, utf16() As Byte, codePoint As Long, currByte As Byte
utf8 = utf8str
ReDim utf16(0 To (UBound(utf8) - LBound(utf8) + 1) * 2)
i = LBound(utf8): j = 0
Do While i <= UBound(utf8)
codePoint = utf8(i)
numBytesOfCodePoint = numBytesOfCodePoints(codePoint)
If numBytesOfCodePoint = 0 Then
If raiseErrors Then Err.Raise 5, methodName, "Invalid byte"
GoTo insertErrChar
ElseIf numBytesOfCodePoint = 1 Then
utf16(j) = codePoint
j = j + 2
ElseIf i + numBytesOfCodePoint - 1 > UBound(utf8) Then
If raiseErrors Then _
Err.Raise 5, methodName, _
"Incomplete UTF-8 codepoint at end of string."
GoTo insertErrChar
Else
codePoint = utf8(i) And mask(numBytesOfCodePoint)
For k = 1 To numBytesOfCodePoint - 1
currByte = utf8(i + k)
If (currByte And &HC0&) = &H80& Then
codePoint = (codePoint * &H40&) + (currByte And &H3F)
Else
If raiseErrors Then _
Err.Raise 5, methodName, "Invalid continuation byte"
GoTo insertErrChar
End If
Next k
'Convert the Unicode codepoint to UTF-16LE bytes
If codePoint < minCp(numBytesOfCodePoint) Then
If raiseErrors Then _
Err.Raise 5, methodName, "Overlong encoding"
GoTo insertErrChar
ElseIf codePoint < &HD800& Then
utf16(j) = CByte(codePoint And &HFF&)
utf16(j + 1) = CByte(codePoint \ &H100&)
j = j + 2
ElseIf codePoint < &HE000& Then
If raiseErrors Then _
Err.Raise 5, methodName, _
"Invalid Unicode codepoint.(Range reserved for surrogate pairs)"
GoTo insertErrChar
ElseIf codePoint < &H10000 Then
If codePoint = &HFEFF& Then GoTo nextCp '(BOM - will be ignored)
utf16(j) = codePoint And &HFF&
utf16(j + 1) = codePoint \ &H100&
j = j + 2
ElseIf codePoint < &H110000 Then 'Calculate surrogate pair
Dim m As Long, lowSurrogate As Long, highSurrogate As Long
m = codePoint - &H10000 '(m \ &H400&) =most sign. 10 bits of m
highSurrogate = &HD800& Or (m \ &H400&)
lowSurrogate = &HDC00& Or (m And &H3FF) 'least sig. 10 bits of m
utf16(j) = highSurrogate And &HFF&
utf16(j + 1) = highSurrogate \ &H100&
utf16(j + 2) = lowSurrogate And &HFF&
utf16(j + 3) = lowSurrogate \ &H100&
j = j + 4
Else
If raiseErrors Then _
Err.Raise 5, methodName, _
"Codepoint outside of valid Unicode range"
insertErrChar: utf16(j) = &HFD: utf16(j + 1) = &HFF: j = j + 2
If numBytesOfCodePoint = 0 Then numBytesOfCodePoint = 1
End If
End If
nextCp: i = i + numBytesOfCodePoint 'Move to the next UTF-8 codepoint
Loop
DecodeUTF8 = MidB$(utf16, 1, j)
End Function
因为我必须解决这个问题,所以我想出了以下函数,可以成功转换 128 到 255 之间的字符
Private Function utf8ToUTF16(ByVal strText As String) As String
Dim i&, l1%, l2%, l3%
For i = 1 To Len(strText)
l1 = Asc(Mid(strText, i, 1))
If i <> Len(strText) Then l2 = Asc(Mid(strText, i + 1, 1))
Select Case l1
Case 194
utf8ToUTF16 = utf8ToUTF16 & WorksheetFunction.Unichar(l2): i = i + 1
Case 195
utf8ToUTF16 = utf8ToUTF16 & WorksheetFunction.Unichar(l2 + &H40): i = i + 1
Case 197
utf8ToUTF16 = utf8ToUTF16 & WorksheetFunction.Unichar(l2 + &HC0): i = i + 1
Case 203
utf8ToUTF16 = utf8ToUTF16 & WorksheetFunction.Unichar(l2 + &H240): i = i + 1
Case 226
If l2 = 128 Then
l3 = Asc(Mid(strText, i + 2, 1))
utf8ToUTF16 = utf8ToUTF16 & WorksheetFunction.Unichar(l3 + &H1F80)
i = i + 2
ElseIf l2 = 130 Then
l3 = Asc(Mid(strText, i + 2, 1))
utf8ToUTF16 = utf8ToUTF16 & WorksheetFunction.Unichar(l3 + &H2000)
i = i + 2
End If
Case Else
utf8ToUTF16 = utf8ToUTF16 & Chr(l1)
End Select
Next i
End Function
现在将“ĂRVĂŤZTĹ°RĹ TĂśKĂ–RFĂšRĂ“GÉP”传递给此函数(从标准UTF-8编码文件读取)将返回“ÁRVÍZTðRŐTÜKÖRFÚRÓGÉP”。
编辑
现在我有时间清理这个了。
Private Function utf8ToUTF16(ByVal strText As String) As String
Dim i&, l1&, l2&, l3&, l4&, l&
For i = 1 To Len(strText)
l1 = Asc(Mid(strText, i, 1))
If i + 1 <= Len(strText) Then l2 = Asc(Mid(strText, i + 1, 1))
If i + 2 <= Len(strText) Then l3 = Asc(Mid(strText, i + 2, 1))
If i + 3 <= Len(strText) Then l4 = Asc(Mid(strText, i + 3, 1))
Select Case l1
Case 1 To 127
l = l1
Case 194 To 223
l = ((l1 And &H1F) * 2 ^ 6) Or (l2 And &H3F)
i = i + 1
Case 224 To 239
l = ((l1 And &HF) * 2 ^ 12) Or ((l2 And &H3F) * 2 ^ 6) Or (l3 And &H3F)
i = i + 2
Case 240 To 255
l = ((l1 And &H7) * 2 ^ 18) Or ((l2 And &H3F) * 2 ^ 12) Or ((l3 And &H3F) * 2 ^ 6) Or (l4 And &H3F)
i = i + 4
Case Else
l = 63 ' question mark
End Select
utf8ToUTF16 = utf8ToUTF16 & IIf(l < 55296, WorksheetFunction.Unichar(l), "?")
Next i
End Function
我意识到,55295 (D7FF) 以上的字符不会出现,所以它会输出一个问号作为占位符。
我发现这个简单的 VBA 代码非常适合文本文件中的西班牙重音字符。给定一个带有 UTF-8 双字符的字符串,它返回一个带有重音字符的字符串:
Function UTF8to16(str As String) As String
Dim position As Long, strConvert As String, codeReplace As Integer, strOut As String
strOut = str
position = InStr(strOut, Chr(195))
If position > 0 Then
Do Until position = 0
strConvert = Mid(strOut, position, 2)
codeReplace = Asc(Right(strConvert, 1))
If codeReplace < 255 Then
strOut = Replace(strOut, strConvert, Chr(codeReplace + 64))
Else
strOut = Replace(strOut, strConvert, Chr(34))
End If
position = InStr(strOut, Chr(195))
Loop
End If
UTF8to16 = strOut
End Function
这对我有用
Function utf8ToUTF16(ByVal strText As String) As String
Dim binText() As Byte, i As Long
'size the binary buffer
ReDim binText(Len(strText) - 1 + 3)
'insert BOM in 0,1,2 positions
binText(0) = &HEF
binText(1) = &HBB
binText(2) = &HBF
'append text characters
For i = 1 To Len(strText)
binText(i + 2) = Asc(Mid(strText, i, 1))
Next
'write to a binary stream
With CreateObject("ADODB.Stream")
.Type = 1
.Open
.write binText
'convert to the text stream
.Flush
.Position = 0
.Type = 2
.Charset = "utf-8"
'read the result skipping BOM
.Position = 3
utf8ToUTF16 = .ReadText
.Close
End With
End Function