有效的完整 utf8 序列中的一个可打印 latin1 字符

Question

我正在尝试使用 Golang 的 libpq 驱动程序将电子邮件地址插入到 Postgres 中。错误信息是

无法执行查询。 pq：编码“UTF8”中字节序列为 0xf0 0x9f 0x9a 0x80 的字符在编码“LATIN1”中没有等效项

事实上，这些字节中只有 0xf0 对应一个可打印的 LATIN1 字符，但我并不认为它们本来就是 LATIN1 字符：完整的字节序列是 UTF-8 编码的 🚀（:rocket:）字符。

这是电子邮件地址。

// The display name starts with 🚀, whose UTF-8 encoding is the byte sequence 0xf0 0x9f 0x9a 0x80 named in the
// error message.
author := "🚀 Steven Duncan 🌌 <[email protected]>"

我正在尝试使用 SanitizeSQL 函数来捕获此错误，但它没有捕获

// Insert statement with a single placeholder, $1, for the author value.
sqlStatement := `INSERT INTO logs (author) VALUES ($1) RETURNING id`

// Substitute author for $1 on the client side; err is non-nil when SanitizeSQL rejects the arguments.
sql_with_args, err := SanitizeSQL(sqlStatement, author)

这是代码（完整演示在 Go Playground）

func SanitizeSQL(sql string, args ...any) (string, error) {
    query, err := NewQuery(sql)
    if err != nil {
        return "", err
    }
    return query.Sanitize(args...)
}

// Part is one piece of a lexed query. Query.Sanitize accepts two dynamic types: a string holding literal SQL
// text, and an int holding a 1-based placeholder number; any other type is rejected as an invalid Part.
type Part any

// Query is a SQL statement split into an ordered list of parts, as produced by NewQuery.
type Query struct {
    // Parts holds the literal text fragments (string) and placeholder numbers (int) in source order.
    Parts []Part
}

// replacementcharacterwidth is the encoded width, in bytes, of U+FFFD, the Unicode replacement character.
//
// utf8.DecodeRuneInString reports a decoding failure by returning utf8.RuneError, but utf8.RuneError is itself
// the rune U+FFFD. A decoded utf8.RuneError whose width is 3 is therefore a genuine replacement character that
// was present in the input, not an error.
//
// https://github.com/jackc/pgx/issues/1380
const replacementcharacterwidth = 3

// Sanitize renders the query as a single SQL string, replacing each placeholder part with the literal form of
// the matching element of args (placeholder $n maps to args[n-1]).
//
// Supported argument types are nil, int64, float64, bool, []byte, string and time.Time. An error is returned
// when a placeholder number is below 1 or beyond len(args), when an argument or part has an unsupported type,
// or when any element of args is never referenced by a placeholder.
func (q *Query) Sanitize(args ...any) (string, error) {
    var out bytes.Buffer
    referenced := make([]bool, len(args))

    for _, p := range q.Parts {
        switch v := p.(type) {
        case string:
            // Literal SQL text is copied through untouched.
            out.WriteString(v)

        case int:
            idx := v - 1
            if idx < 0 {
                return "", fmt.Errorf("first sql argument must be > 0")
            }
            if idx >= len(args) {
                return "", fmt.Errorf("insufficient arguments")
            }

            var literal string
            switch a := args[idx].(type) {
            case nil:
                literal = "null"
            case int64:
                literal = strconv.FormatInt(a, 10)
            case float64:
                literal = strconv.FormatFloat(a, 'f', -1, 64)
            case bool:
                literal = strconv.FormatBool(a)
            case []byte:
                literal = QuoteBytes(a)
            case string:
                literal = QuoteString(a)
            case time.Time:
                // Timestamps are truncated to microseconds before formatting.
                literal = a.Truncate(time.Microsecond).Format("'2006-01-02 15:04:05.999999999Z07:00:00'")
            default:
                return "", fmt.Errorf("invalid arg type: %T", a)
            }
            referenced[idx] = true

            // Prevent SQL injection via Line Comment Creation: padding with spaces keeps a preceding "-" and
            // a negative number from fusing into a "--" comment marker.
            // https://github.com/jackc/pgx/security/advisories/GHSA-m7wr-2xf7-cm9p
            out.WriteString(" " + literal + " ")

        default:
            return "", fmt.Errorf("invalid Part type: %T", v)
        }
    }

    // Every supplied argument must have been consumed by at least one placeholder.
    for i := range referenced {
        if referenced[i] {
            continue
        }
        return "", fmt.Errorf("unused argument: %d", i)
    }
    return out.String(), nil
}

// NewQuery lexes sql into a Query whose Parts alternate between literal text and placeholder numbers.
//
// The lexer is driven as a state machine: starting in rawState, each state function is run and returns the
// next one until a nil state ends lexing. The returned error is always nil.
func NewQuery(sql string) (*Query, error) {
    lex := new(sqlLexer)
    lex.src = sql
    lex.stateFn = rawState

    for {
        step := lex.stateFn
        if step == nil {
            break
        }
        lex.stateFn = step(lex)
    }

    return &Query{Parts: lex.parts}, nil
}

// QuoteString returns str wrapped in single quotes, with every single quote inside str doubled.
// All other bytes are copied through unchanged.
func QuoteString(str string) string {
    var b strings.Builder
    b.Grow(len(str) + 2)

    b.WriteByte('\'')
    // Byte-wise iteration is safe: 0x27 never occurs inside a multi-byte UTF-8 sequence.
    for i := 0; i < len(str); i++ {
        if str[i] == '\'' {
            b.WriteByte('\'')
        }
        b.WriteByte(str[i])
    }
    b.WriteByte('\'')

    return b.String()
}

// QuoteBytes returns buf as a single-quoted literal of the form '\x<hex>', where <hex> is the lowercase
// hexadecimal encoding of buf. An empty or nil buf yields '\x'.
func QuoteBytes(buf []byte) string {
    const (
        prefix = `'\x`
        suffix = `'`
    )

    out := make([]byte, len(prefix)+hex.EncodedLen(len(buf))+len(suffix))
    copy(out, prefix)
    hex.Encode(out[len(prefix):], buf)
    copy(out[len(out)-len(suffix):], suffix)

    return string(out)
}

// sqlLexer holds the state of the state-machine lexer that NewQuery uses to split a SQL string into parts.
type sqlLexer struct {
    // src is the SQL text being lexed.
    src     string
    // start is the byte offset in src where the literal text not yet appended to parts begins.
    start   int
    // pos is the byte offset in src of the next rune to decode.
    pos     int
    nested  int // multiline comment nesting level.
    // stateFn is the state function NewQuery runs next; nil ends lexing.
    stateFn stateFn
    // parts accumulates the literal text fragments and placeholder numbers found so far.
    parts   []Part
}

// stateFn is one state of the lexer: it consumes input from the sqlLexer and returns the state to run next,
// or nil to stop lexing.
type stateFn func(*sqlLexer) stateFn

// rawState lexes SQL text that is outside any quoted string or comment and hands over to the matching state
// when it sees the opening of a string ('...', "...", e'...'), a comment (-- or /*), or a placeholder ($
// followed by a digit).
//
// On a placeholder the text preceding the $ is appended to l.parts. When decoding yields utf8.RuneError with a
// width other than replacementcharacterwidth (end of input or an invalid encoding), the pending text is
// appended and nil is returned, which ends lexing.
func rawState(l *sqlLexer) stateFn {
    for {
        cur, size := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += size

        switch {
        case cur == '\'':
            return singleQuoteState

        case cur == '"':
            return doubleQuoteState

        case cur == 'e' || cur == 'E':
            // An e/E immediately followed by a quote opens an escape string.
            if next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:]); next == '\'' {
                l.pos += nextSize
                return escapeStringState
            }

        case cur == '$':
            if next, _ := utf8.DecodeRuneInString(l.src[l.pos:]); next >= '0' && next <= '9' {
                // Emit the text before the '$' (size is the width of the '$' itself); the digits are left
                // for placeholderState.
                if l.pos > l.start {
                    l.parts = append(l.parts, l.src[l.start:l.pos-size])
                }
                l.start = l.pos
                return placeholderState
            }

        case cur == '-':
            if next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:]); next == '-' {
                l.pos += nextSize
                return oneLineCommentState
            }

        case cur == '/':
            if next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:]); next == '*' {
                l.pos += nextSize
                return multilineCommentState
            }

        case cur == utf8.RuneError && size != replacementcharacterwidth:
            if l.pos > l.start {
                l.parts = append(l.parts, l.src[l.start:l.pos])
                l.start = l.pos
            }
            return nil
        }
    }
}

// singleQuoteState consumes the body of a single-quoted string whose opening quote has already been consumed.
// A doubled quote ('') is treated as an escaped quote; a lone quote closes the string and returns to rawState.
//
// When decoding yields utf8.RuneError with a width other than replacementcharacterwidth (end of input or an
// invalid encoding), the pending text is appended to l.parts and nil is returned, which ends lexing.
func singleQuoteState(l *sqlLexer) stateFn {
    for {
        cur, size := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += size

        if cur == '\'' {
            next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:])
            if next == '\'' {
                // Escaped quote: swallow the second quote and stay inside the string.
                l.pos += nextSize
                continue
            }
            return rawState
        }

        if cur == utf8.RuneError && size != replacementcharacterwidth {
            if l.pos > l.start {
                l.parts = append(l.parts, l.src[l.start:l.pos])
                l.start = l.pos
            }
            return nil
        }
    }
}

// doubleQuoteState consumes the body of a double-quoted section whose opening quote has already been consumed.
// A doubled quote ("") is treated as an escaped quote; a lone quote closes the section and returns to rawState.
//
// When decoding yields utf8.RuneError with a width other than replacementcharacterwidth (end of input or an
// invalid encoding), the pending text is appended to l.parts and nil is returned, which ends lexing.
func doubleQuoteState(l *sqlLexer) stateFn {
    for {
        cur, size := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += size

        if cur == '"' {
            next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:])
            if next == '"' {
                // Escaped quote: swallow the second quote and stay inside the quoted section.
                l.pos += nextSize
                continue
            }
            return rawState
        }

        if cur == utf8.RuneError && size != replacementcharacterwidth {
            if l.pos > l.start {
                l.parts = append(l.parts, l.src[l.start:l.pos])
                l.start = l.pos
            }
            return nil
        }
    }
}

// placeholderState consumes the digits of a placeholder and appends the placeholder number to l.parts as an
// int. The $ must already have been consumed, and the first rune must be a digit.
//
// The number saturates at the largest int value instead of wrapping around, so an absurdly long digit run
// can never alias a small, valid placeholder number; Query.Sanitize then rejects it as out of range.
func placeholderState(l *sqlLexer) stateFn {
    // Largest value representable by int on this platform.
    const maxInt = int(^uint(0) >> 1)

    num := 0

    for {
        r, width := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += width

        if r < '0' || r > '9' {
            // r is not part of the placeholder: emit the number, un-read r and let rawState handle it.
            l.parts = append(l.parts, num)
            l.pos -= width
            l.start = l.pos
            return rawState
        }

        digit := int(r - '0')
        if num > (maxInt-digit)/10 {
            // num*10 + digit would overflow; pin to maxInt and keep consuming the remaining digits.
            num = maxInt
            continue
        }
        num = num*10 + digit
    }
}

// escapeStringState consumes the body of an escape string (e'...') whose opening e' has already been consumed.
// A backslash skips the rune that follows it, a doubled quote ('') is treated as an escaped quote, and a lone
// quote closes the string and returns to rawState.
//
// When decoding yields utf8.RuneError with a width other than replacementcharacterwidth (end of input or an
// invalid encoding), the pending text is appended to l.parts and nil is returned, which ends lexing.
func escapeStringState(l *sqlLexer) stateFn {
    for {
        cur, size := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += size

        switch {
        case cur == '\\':
            // Skip whatever rune is being escaped, so that \' does not terminate the string.
            _, escapedSize := utf8.DecodeRuneInString(l.src[l.pos:])
            l.pos += escapedSize

        case cur == '\'':
            next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:])
            if next != '\'' {
                return rawState
            }
            l.pos += nextSize

        case cur == utf8.RuneError && size != replacementcharacterwidth:
            if l.pos > l.start {
                l.parts = append(l.parts, l.src[l.start:l.pos])
                l.start = l.pos
            }
            return nil
        }
    }
}

// oneLineCommentState consumes a -- comment whose leading -- has already been consumed. The comment ends at
// the first '\n' or '\r', after which lexing resumes in rawState.
//
// A backslash has no special meaning inside a -- comment: PostgreSQL ends the comment at the end of the line
// regardless, so the lexer must not let a trailing backslash swallow the newline. Otherwise a placeholder on
// the following line would be treated as commented out and never substituted.
//
// When decoding yields utf8.RuneError with a width other than replacementcharacterwidth (end of input or an
// invalid encoding), the pending text is appended to l.parts and nil is returned, which ends lexing.
func oneLineCommentState(l *sqlLexer) stateFn {
    for {
        r, width := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += width

        switch r {
        case '\n', '\r':
            return rawState
        case utf8.RuneError:
            if width != replacementcharacterwidth {
                if l.pos-l.start > 0 {
                    l.parts = append(l.parts, l.src[l.start:l.pos])
                    l.start = l.pos
                }
                return nil
            }
        }
    }
}

// multilineCommentState consumes a /* ... */ comment whose opening /* has already been consumed. Comments
// nest: each inner /* increments l.nested and each */ either decrements it or, at nesting level zero, closes
// the comment and returns to rawState.
//
// When decoding yields utf8.RuneError with a width other than replacementcharacterwidth (end of input or an
// invalid encoding), the pending text is appended to l.parts and nil is returned, which ends lexing.
func multilineCommentState(l *sqlLexer) stateFn {
    for {
        cur, size := utf8.DecodeRuneInString(l.src[l.pos:])
        l.pos += size

        if cur == utf8.RuneError {
            if size == replacementcharacterwidth {
                // A real U+FFFD inside the comment is ordinary comment text.
                continue
            }
            if l.pos > l.start {
                l.parts = append(l.parts, l.src[l.start:l.pos])
                l.start = l.pos
            }
            return nil
        }

        if cur != '/' && cur != '*' {
            continue
        }

        // Only "/*" and "*/" matter; the second rune is consumed only when it completes one of them.
        next, nextSize := utf8.DecodeRuneInString(l.src[l.pos:])
        switch {
        case cur == '/' && next == '*':
            l.pos += nextSize
            l.nested++
        case cur == '*' && next == '/':
            l.pos += nextSize
            if l.nested == 0 {
                return rawState
            }
            l.nested--
        }
    }
}

Answer 1

无法以 LATIN1 编码表示的字符，不能存储在编码为 LATIN1 的数据库中。实际上只有一个好的解决方案：使用 UTF8 编码创建一个新数据库，转储原数据库，并将其恢复到新的 UTF8 数据库中。

有效的完整 utf8 序列中的一个可打印 latin1 字符

问题描述投票：0回答：1

1个回答

最新问题

有效的完整 utf8 序列中的一个可打印 latin1 字符

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1