Flex(词法分析器)- 匹配 unicode

ascSymbol     !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|:
uniSymbol     \p{Symbol}|\p{Other_Symbol}|\p{Punctuation}
symbol        ascSymbol|uniSymbol{-}[^|_"',;]


例如,我正在使用 cmake,它被配置为在构建时从 *.l 和 *.y 文件生成词法分析器/解析器。理想情况下,我想要一个不需要安装 GHC 或其他 Haskell 编译器的解决方案。

ruby unicode cmake flex-lexer

事实证明,除非 Flex 源代码本身加入相关支持,否则在 Flex 中获得 Unicode 支持会是一件很痛苦的事情。Flex 中似乎有一些针对 Unicode 的实验性内容,但我没能找到任何包含这些内容的正式发布版本。

Ragel 的文档很有见地,而且 Ragel 内置了对 Unicode 的支持。我后来发现了这篇文章,它给出了如何让 Ragel 和 C++ 很好地配合的示例。Ragel 似乎是更好的选择,所以我决定采用它。



上面所说的“内置支持”或许有些夸张。用 Ragel 获得 Unicode 支持确实更容易,但并不是开箱即用的。我使用 cmake,从 UCD 7 的派生数据文件生成状态机。在 CMakeLists.txt 中我这样做:

# Ruby is required to generate the Unicode Ragel machine.
find_package(Ruby REQUIRED)
message("Found Ruby ${RUBY_VERSION}")

set(UNICODE_MACHINE_PATH "${PROJECT_SOURCE_DIR}/src/unicode.rl")

# Generate the machine when it does not exist yet, or when the caller
# forces regeneration by setting gen_unicode (e.g. -Dgen_unicode=ON).
if(NOT EXISTS "${UNICODE_MACHINE_PATH}" OR gen_unicode)
  message("Attempting to generate unicode state machine")
  execute_process(
    COMMAND "${RUBY_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/unicode2ragel.rb"
    OUTPUT_FILE "${UNICODE_MACHINE_PATH}"
    RESULT_VARIABLE RAGEL_UNICODE_GEN_RES)

  # Pass the variable name, not its expansion: on failure the result may be
  # an error string rather than a number, which would break an expanded
  # if() condition.
  if(RAGEL_UNICODE_GEN_RES EQUAL 0)
    message("Generated Ragel Unicode state machine")
  else()
    # A failed run may leave an empty or partial output file behind; remove
    # it so the NOT EXISTS check above triggers a retry on the next configure.
    file(REMOVE "${UNICODE_MACHINE_PATH}")
    message(SEND_ERROR "Unable to generate unicode state machine")
  endif()
endif()

然后在 unicode2ragel.rb 中(随 Ragel 一起提供,并针对 UCD 7 进行了轻微修改)

#!/usr/bin/env ruby
#
# This script uses the unicode spec to generate Ragel state machines
# that recognize unicode characters. It reads the UCD general category
# chart (CHART_URL) and provides the helpers used to emit one named
# machine per general category (Lu, Ll, Nd, ...).
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
#    -e, --encoding [ucs4 | utf8]     Data encoding
#    -h, --help                       Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <[email protected]>

require 'optparse'
require 'open-uri'

ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
CHART_URL = "http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"
# Alternative chart: "http://www.unicode.org/Public/7.0.0/ucd/DerivedCoreProperties.txt"

###
# Display vars & default option

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8

###
# Option parsing

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so +o+ is nil for a bare "-e";
    # keep the default encoding in that case instead of crashing.
    @encoding = o.downcase.to_sym if o
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
  # Report on stderr and exit non-zero: stdout is the generated machine, and
  # the build checks the exit status to detect a failed generation.
  warn "Invalid encoding: #{@encoding}"
  warn cli_opts.to_s
  exit 1
end

##
# Reads the chart at +url+ and yields, for every line that belongs to the
# general category +property+, the codepoint range (a Range of Integers) and
# the trailing description text of that line.
#
# +url+ may be an http(s) URL or a local path. The chart is fetched once per
# url and cached, because the generator asks for it once per category.
def each_alpha(url, property)
  @chart_lines ||= {}
  # URI.open is required here: open-uri no longer patches Kernel#open
  # (removed in Ruby 3.0), so a bare open(url) cannot fetch a URL.
  lines = (@chart_lines[url] ||= URI.open(url) { |file| file.readlines })

  lines.each do |line|
    next if line.start_with?('#')
    next if line !~ /; #{property} #/

    range, description = line.split(/;/)
    range = range.strip
    # Keep only the text after the last '#' on the line.
    description = description.gsub(/.*#/, '').strip

    # A line holds either a single codepoint or a "first..last" pair.
    start, stop = range.include?('..') ? range.split('..') : [range, range]
    codepoints = start.hex..stop.hex
    yield codepoints, description
  end
end

###
# Formats +n+ as upper-case hex, left-padded with a zero to an even number
# of digits (i.e. whole bytes).
def to_hex(n)
  hex = "%0X" % n
  hex.length.odd? ? "0#{hex}" : hex
end

###
# UCS4 is just a straight hex conversion of the unicode codepoint.
def to_ucs4(range)
  rangestr = "0x" + to_hex(range.begin)
  rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
  [rangestr]
end

##
# 0x00     - 0x7f     -> 0zzzzzzz[7]
# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

##
# Encodes codepoint +n+ as UTF-8 and returns the encoded bytes as one
# upper-case hex string (see to_hex). Codepoints above 0x10ffff yield "00".
def to_utf8_enc(n)
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0x7ff
    y = 0xc0 | (n >> 6)
    z = 0x80 | (n & 0x3f)
    r = (y << 8) | z
  elsif n <= 0xffff
    x = 0xe0 | (n >> 12)
    y = 0x80 | ((n >> 6) & 0x3f)
    z = 0x80 | (n & 0x3f)
    r = (x << 16) | (y << 8) | z
  elsif n <= 0x10ffff
    w = 0xf0 | (n >> 18)
    x = 0x80 | ((n >> 12) & 0x3f)
    y = 0x80 | ((n >> 6) & 0x3f)
    z = 0x80 | (n & 0x3f)
    r = (w << 24) | (x << 16) | (y << 8) | z
  end
  to_hex(r)
end

##
# Inverse of to_utf8_enc: takes the hex string +n+ of a UTF-8 byte sequence
# and returns the payload bits as an Integer. count_codepoints also feeds it
# single bytes (e.g. "80", "BF"), for which it returns the low 6 payload
# bits of a non-ASCII byte.
def from_utf8_enc(n)
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    r = (y << 6) | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # The lead-byte bits sit above two 6-bit continuation payloads, so the
    # shift is 12 (mirrors `n >> 12` in to_utf8_enc).
    r = (x << 12) | (y << 6) | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    r = (w << 18) | (x << 12) | (y << 6) | z
  end
  r
end

###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# The original author noted that the [5.1] unicode standard had no ranges
# straddling utf8 boundaries; this is included for completeness as there is
# no telling if that will ever change.
def utf8_ranges(range)
  ranges = []
  UTF8_BOUNDARIES.each do |max|
    next unless range.begin <= max
    return ranges << range if range.end <= max

    # Parentheses are required: `ranges << a .. b` parses as
    # `(ranges << a) .. b` and raises ArgumentError.
    ranges << (range.begin..max)
    range = (max + 1)..range.end
  end
  ranges
end

##
# Builds the Ragel byte-range expressions matching every byte sequence
# between the hex strings +start+ and +stop+ (equal length, two hex digits
# per byte). Returns an Array of Strings such as "0xC3 0x80..0x96 ".
def build_range(start, stop)
  size = start.size / 2
  left = size - 1
  return [""] if size < 1

  a = start[0..1]
  b = stop[0..1]

  ###
  # Shared prefix
  if a == b
    return build_range(start[2..-1], stop[2..-1]).map { |elt| "0x#{a} " + elt }
  end

  ###
  # Unshared prefix, end of run
  return ["0x#{a}..0x#{b} "] if left.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  #     0x123456 .. 0x12FFFF
  #     0x130000 .. 0x55FFFF
  #     0x560000 .. 0x56789A
  ret = []
  ret << build_range(start, a + "FF" * left)

  ###
  # Only generate middle range if need be.
  if a.hex + 1 != b.hex
    max = to_hex(b.hex - 1)
    max = "FF" if b == "FF"
    ret << "0x#{to_hex(a.hex + 1)}..0x#{max} " + "0x00..0xFF " * left
  end

  ###
  # Don't generate last range if it is covered by first range
  ret << build_range(b + "00" * left, stop) unless b == "FF"
  ret.flatten
end

##
# Returns the Ragel byte-range expressions for the codepoint +range+ in
# UTF-8. Always returns an Array (empty when nothing can be encoded).
def to_utf8(range)
  utf8_ranges(range).flat_map do |r|
    build_range(to_utf8_enc(r.begin), to_utf8_enc(r.end))
  end
end

##
# Perform a 3-way comparison of the number of codepoints advertised by
# the unicode spec for the given range, the originally parsed range,
# and the resulting utf8 encoded range.

# Number of codepoints matched by one generated expression +code+: the
# product of the widths of every "0xAA..0xBB" element it contains.
def count_codepoints(code)
  code.split(' ').inject(1) do |acc, elt|
    match = /0x(.+)\.\.0x(.+)/.match(elt)
    next acc unless match

    lo = match[1]
    hi = match[2]
    if @encoding == :utf8
      acc * (from_utf8_enc(hi) - from_utf8_enc(lo) + 1)
    else
      acc * (hi.hex - lo.hex + 1)
    end
  end
end

# True when the count advertised in +desc+ ("[N]", defaulting to 1), the size
# of +range+ and the total matched by +codes+ all agree.
def is_valid?(range, desc, codes)
  advertised = /\[(\d+)\]/.match(desc)
  spec_count = advertised ? advertised[1].to_i : 1
  range_count = range.end - range.begin + 1

  sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
  sum == spec_count && sum == range_count
end

##
# Generate the state machine +name+ for general category +property+ to stdout.
# Raises RuntimeError when an encoded range fails the is_valid? check.
def generate_machine(name, property)
  pipe = " "
  puts " #{name} = "
  each_alpha(CHART_URL, property) do |range, desc|
    codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)

    unless is_valid?(range, desc, codes)
      raise "Invalid encoding of range #{range}: #{codes.inspect}"
    end

    # Widen the code column when an expression exceeds the default width and
    # take the extra space away from the description column.
    range_width = [codes.map { |a| a.size }.max, RANGE_WIDTH].max
    desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
    desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH

    desc = desc[0..desc_width - 4] + "..." if desc.size > desc_width

    codes.each_with_index do |r, idx|
      # Only the first expression of a range carries the description.
      desc = "" unless idx.zero?
      code = "%-#{range_width}s" % r
      puts " #{pipe} #{code} ##{desc}"
      pipe = "|"
    end
  end
  puts " ;"
  puts ""
end

puts <<EOF
# The following Ragel file was autogenerated from: #{CHART_URL}
#
# It defines ualpha, udigit, ualnum.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
    machine WChar;
EOF

# Machine name => general category code, emitted in this order.
GENERAL_CATEGORIES = {
  :uUppercaseLetter      => "Lu",
  :uLowercaseLetter      => "Ll",
  :uTitlecaseLetter      => "Lt",
  :uModifierLetter       => "Lm",
  :uOtherLetter          => "Lo",
  :uNonspacingMark       => "Mn",
  :uEnclosingMark        => "Me",
  :uSpacingMark          => "Mc",
  :uDecimalNumber        => "Nd",
  :uLetterNumber         => "Nl",
  :uOtherNumber          => "No",
  :uSpaceSeparator       => "Zs",
  :uLineSeparator        => "Zl",
  :uParagraphSeparator   => "Zp",
  :uFormat               => "Cf",
  :uPrivateUse           => "Co",
  :uSurrogate            => "Cs",
  :uDashPunctuation      => "Pd",
  :uOpenPunctuation      => "Ps",
  :uClosePunctuation     => "Pe",
  :uConnectorPunctuation => "Pc",
  :uOtherPunctuation     => "Po",
  :uMathSymbol           => "Sm",
  :uCurrencySymbol       => "Sc",
  :uModifierSymbol       => "Sk",
  :uOtherSymbol          => "So",
  :uInitialPunctuation   => "Pi",
  :uFinalPunctuation     => "Pf"
}

GENERAL_CATEGORIES.each do |machine_name, category|
  generate_machine(machine_name, category)
end

puts <<EOF
}%%
EOF

然后在你的 ragel 机器文件中,你可以包含 unicode.rl 并访问定义的每个 unicode 组,例如 uUppercaseLetter 等等...

