我正在调试我们软件中一个现有的 Boost.Spirit Qi 语法,它用于解析“端点”(host:port,其中 host 可以是主机名、IPv4 地址或 IPv6 地址)。问题出在 hostname 部分(端口、IPv4 和 IPv6 部分都工作正常)。完整的代码相当庞大,所以我把它精简成了下面的示例,可以直接在 Wandbox(C++17,Boost 1.79.0)上运行:
#include <iostream>
#include <string>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/fusion/include/boost_array.hpp>
#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/algorithm/string.hpp>
/// Plain holder for the two pieces of an endpoint: host text and port number.
class Endpoint
{
public:
    /// Creates an endpoint with an empty host and port 0.
    Endpoint()
        : _host()
        , _port( 0 )
    {
    }

    std::string _host;     ///< Host part; empty until something is stored in it.
    unsigned short _port;  ///< Port part; 0 until something is stored in it.
};
// Qi grammar that recognises a host name and synthesizes the matched
// characters into a std::string.
//
// NOTE: this is the grammar under investigation. As the program output that
// follows shows, it accepts/rejects the right inputs, but the synthesized
// string ends up with its last part appended twice (e.g. "foo.net" is
// reported as "foo.netnet").
template < typename ITERATOR >
struct HostnameGrammar :
boost::spirit::qi::grammar< ITERATOR, std::string() >
{
HostnameGrammar()
:
HostnameGrammar::base_type( start )
{
using boost::spirit::qi::alnum;
using boost::spirit::qi::alpha;
using boost::spirit::qi::char_;
// _1 is brought into scope here but not used by any rule below.
using boost::spirit::_1;
// A host name is either a dotted name or a single label; the dotted
// alternative is tried first.
start %=
dottedName | singleName
;
// One or more "label." groups followed by a final tld part. Every
// sub-parser appends into the same std::string attribute.
dottedName %=
+( singleName >> char_( '.' ) ) >>
tld
;
// A label: one or more alphanumerics, then any number of "-" + alphanumerics
// groups, so a label can neither start nor end with '-'.
singleName %=
+alnum >> *( char_('-') >> +alnum )
;
// The last part: must start with one or more letters, followed by any
// number of alphanumeric runs, each optionally preceded by a single '-'.
tld %=
+alpha >> *( -char_('-') >> +alnum )
;
}
// All four rules expose a std::string attribute.
boost::spirit::qi::rule< ITERATOR, std::string() > start;
boost::spirit::qi::rule< ITERATOR, std::string() > dottedName;
boost::spirit::qi::rule< ITERATOR, std::string() > singleName;
boost::spirit::qi::rule< ITERATOR, std::string() > tld;
};
// Qi grammar for "[host][:port]". Instead of synthesizing an attribute, its
// semantic actions write the parsed host and port straight into the Endpoint
// object handed to the constructor.
template < typename ITERATOR >
struct EndpointGrammar : boost::spirit::qi::grammar< ITERATOR, std::string() >
{
    // endpoint: object that receives the parsed host and port. The grammar
    // keeps a reference to it, so it must outlive the grammar.
    EndpointGrammar( Endpoint & endpoint )
        : EndpointGrammar::base_type( start )
        , endpoint_( endpoint )
    {
        namespace qi = boost::spirit::qi;
        namespace phx = boost::phoenix;
        using boost::spirit::_1;

        // An address is currently just a host name.
        address = hostname;

        // Both the address and the ":port" suffix are optional; each one, when
        // present, is stored into the bound Endpoint by its semantic action.
        start = -address[ phx::ref( endpoint_._host ) = _1 ]
             >> -( ':' >> qi::ushort_[ phx::ref( endpoint_._port ) = _1 ] );
    }

    using StringRule = boost::spirit::qi::rule< ITERATOR, std::string() >;

    Endpoint & endpoint_;                  // Destination of the semantic actions.
    StringRule start;                      // Top-level rule: [address][:port].
    StringRule address;                    // Wrapper around the host name grammar.
    HostnameGrammar< ITERATOR > hostname;  // Sub-grammar matching the host name.
};
int main()
{
std::vector< std::string > endpointStrings {
// these should parse successfully (and do)
"0foo", "foo", "foo.net", "foo:1234", "foo.net:5678", "foo.example.net", "foo.example.net:9012",
"foo-bar", "foo-bar.com", "foo-bar:1234", "foo-bar.net-0:5678", "foo-bar.example.com-1:9012",
// these should fail to parse (and do)
"foo.0bar", "foo.0bar:1234", "foo.bar-", "foo.bar-:1234", "foo-", "-foo"
};
for ( auto const & endpointString : endpointStrings )
{
Endpoint tempEndpoint;
std::string::const_iterator beginIt( endpointString.begin() );
std::string::const_iterator endIt( endpointString.end() );
EndpointGrammar< std::string::const_iterator > grammar( tempEndpoint );
if ( !boost::spirit::qi::parse( beginIt, endIt, grammar ) || beginIt != endIt )
{
std::cout << "Failed: " << endpointString << std::endl;
}
else
{
std::cout << "Succeeded: " << endpointString << " = " << tempEndpoint._host << " / " << tempEndpoint._port << std::endl;
}
}
return 0;
}
该语法对所有示例的处理都符合预期:按照 RFC 规则应当解析成功的都成功了,应当解析失败的也都失败了,这很好。问题是主机名的“最后”一部分被“重复”了一遍。下面是运行输出:
Succeeded: 0foo = 0foo0foo / 0
Succeeded: foo = foofoo / 0
Succeeded: foo.net = foo.netnet / 0
Succeeded: foo:1234 = foofoo / 1234
Succeeded: foo.net:5678 = foo.netnet / 5678
Succeeded: foo.example.net = foo.example.netnet / 0
Succeeded: foo.example.net:9012 = foo.example.netnet / 9012
Succeeded: foo-bar = foo-barfoo-bar / 0
Succeeded: foo-bar.com = foo-bar.comcom / 0
Succeeded: foo-bar:1234 = foo-barfoo-bar / 1234
Succeeded: foo-bar.net-0:5678 = foo-bar.net-0net-0 / 5678
Succeeded: foo-bar.example.com-1:9012 = foo-bar.example.com-1com-1 / 9012
Failed: foo.0bar
Failed: foo.0bar:1234
Failed: foo.bar-
Failed: foo.bar-:1234
Failed: foo-
Failed: -foo
注意其中的“foofoo”、“foo.netnet”、“foo.example.netnet”等。
我已经尝试了这些规则的大约 300 种不同变体——太多了,无法在这里一一列出。只需说明一点:许多变体都能正确区分有效和无效的主机名,但所有能正确识别主机名的变体都存在同样的“最后一部分重复”问题。我已经完全没有思路了。有人知道为什么会这样,以及该如何解决吗?
请注意,我甚至尝试过只接受字母的最简单规则(打算之后再把它扩展到更复杂的字符集),但即使是这个规则也会把“foo.net”变成“foo.netnet”:
start %=
+( +alpha >> char_( '.' ) ) >> +alpha
;
甚至不用细看,我就知道您遇到的是“非原子属性传播到容器属性”的问题。
经典的解决方法是使用 qi::hold[]——但要注意它在回溯时带来的性能开销。
借助 BOOST_SPIRIT_DEBUG* 系列宏,我们来仔细看看 foo.netnet 这个例子:
// Same four rules as in the question, written with the qi:: prefix.
// tld: one or more letters, then any number of alphanumeric runs, each
// optionally preceded by '-'.
tld = +qi::alpha >> *(-qi::char_('-') >> +qi::alnum);
// singleName: alphanumeric runs joined by single '-' characters.
singleName = +qi::alnum >> *(qi::char_('-') >> +qi::alnum);
// dottedName: one or more "singleName." groups followed by a tld.
dottedName = +(singleName >> qi::char_('.')) >> tld;
// hostName: the dotted form is tried first, then a single label.
hostName = dottedName | singleName;
// Registers the four rules for debug tracing; this produces the XML-like
// trace shown below.
BOOST_SPIRIT_DEBUG_NODES((hostName)(dottedName)(singleName)(tld))
输出(节选,Live On Coliru):
<hostName>
<try>foo.net</try>
<dottedName>
<try>foo.net</try>
<singleName>
<try>foo.net</try>
<success>.net</success>
<attributes>[[f, o, o]]</attributes>
</singleName>
<singleName>
<try>net</try>
<success></success>
<attributes>[[f, o, o, ., n, e, t]]</attributes>
</singleName>
<tld>
<try>net</try>
<success></success>
<attributes>[[f, o, o, ., n, e, t, n, e, t]]</attributes>
</tld>
<success></success>
<attributes>[[f, o, o, ., n, e, t, n, e, t]]</attributes>
</dottedName>
<success></success>
<attributes>[[f, o, o, ., n, e, t, n, e, t]]</attributes>
</hostName>
如您所见,第二次尝试 singleName 时匹配了 net,并且已经把它追加到了属性中(此时属性为 foo.net);但由于后面没有更多的 '.',这次匹配只能回溯,而已经追加到属性里的字符并不会随之撤销。
随后 tld 再次匹配 net 并再次追加,于是得到了 foo.netnet。
修复它的“乏味”方法是使用
hold[]
(Live):
dottedName = +qi::hold[singleName >> qi::char_('.')] >> tld;
或者使用前瞻断言(效率较低!,Live)
dottedName = +(&(singleName >> '.') >> singleName >> qi::char_('.')) >> tld;
两种方法都有效。但是,由于您目前所做的只是把输入原样(1:1)传播到结果字符串,因此可以使用 raw[]、operator% 以及字面量(而不是 char_),让一切变得更简单:
// Simplified rules: character literals ('-' and '.') are used instead of
// char_, and only the top-level rule wraps its parser in raw[].
// tld: one letter, then any number of alphanumerics, each optionally
// preceded by '-'.
tld = alpha >> *('-' >> alnum | alnum);
// singleName: alphanumeric runs separated by '-' (operator% is the list parser).
singleName = +alnum % '-';
// dottedName: one or more "singleName." groups followed by a tld.
dottedName = +(singleName >> '.') >> tld;
// hostName: raw[] is applied to the whole alternative, so the sub-rules
// above do not have to build attributes themselves.
hostName = raw[dottedName | singleName];
在这种写法中,tld、singleName 和 dottedName 根本不需要构建属性,因此回溯不会造成任何影响。下面是带完整测试用例的版本:
#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <iostream>
namespace qi = boost::spirit::qi;
namespace Ast {
    /// Parse result: the host text and the numeric port of an endpoint.
    struct Endpoint {
        std::string    host{};   ///< Empty when no host was parsed.
        unsigned short port{0};  ///< 0 when no port was parsed.

        /// Member-wise comparison; the test driver relies on the resulting ==.
        auto operator<=>(const Endpoint&) const = default;
    };
} // namespace Ast
namespace Grammar {
/// Qi grammar matching a host name; the exposed std::string is the raw
/// matched input, so none of the helper rules carries an attribute.
template <typename It>
struct Hostname : qi::grammar<It, std::string()> {
    Hostname() : Hostname::base_type(hostName) {
        using qi::alnum;
        using qi::alpha;
        using qi::raw;

        // Top level: prefer the dotted form and capture the matched text as-is.
        hostName = raw[dottedName | singleName];
        // One or more "label." groups followed by the final part.
        dottedName = +(singleName >> '.') >> tld;
        // Alphanumeric runs separated by single dashes.
        singleName = (+alnum) % '-';
        // Starts with a letter; afterwards alphanumerics, each optionally
        // preceded by a dash.
        tld = alpha >> *(('-' >> alnum) | alnum);
    }

  private:
    qi::rule<It, std::string()> hostName;
    qi::rule<It> dottedName;
    qi::rule<It> singleName;
    qi::rule<It> tld;
};
/// Qi grammar for "[host][:port]" whose semantic actions store the parsed
/// pieces directly into the Ast::Endpoint passed to the constructor.
template <typename It>
struct Endpoint : qi::grammar<It, std::string()> {
    /// @param out receives host and port; the grammar keeps references into
    ///            it, so it must outlive the grammar.
    Endpoint(Ast::Endpoint& out) : Endpoint::base_type(endpoint) {
        namespace phx = boost::phoenix;
        using namespace qi::labels;

        // An address is currently just a host name.
        address = hostname;

        // Both the address and the ":port" suffix are optional.
        endpoint = -address[phx::ref(out.host) = _1]
                >> -(qi::lit(':') >> qi::ushort_[phx::ref(out.port) = _1]);
    }

  private:
    Hostname<It> hostname;
    qi::rule<It, std::string()> endpoint;
    qi::rule<It, std::string()> address;
};
}
int main() {
using It = std::string_view::const_iterator;
struct {
std::string_view input;
Ast::Endpoint expected;
} cases[] = {
//// these should parse successfully (and do)
{"0foo", {"0foo", 0}},
{"foo", {"foo", 0}},
{"foo.net", {"foo.net", 0}},
{"foo:1234", {"foo", 1234}},
{"foo.net:5678", {"foo.net", 5678}},
{"foo.example.net", {"foo.example.net", 0}},
{"foo.example.net:9012", {"foo.example.net", 9012}},
{"foo-bar", {"foo-bar", 0}},
{"foo-bar.com", {"foo-bar.com", 0}},
{"foo-bar:1234", {"foo-bar", 1234}},
{"foo-bar.net-0:5678", {"foo-bar.net-0", 5678}},
{"foo-bar.example.com-1:9012", {"foo-bar.example.com-1", 9012}},
// these should fail to parse (and do)
{"foo.0bar", {}},
{"foo.0bar:1234", {}},
{"foo.bar-", {}},
{"foo.bar-:1234", {}},
{"foo-", {}},
{"-foo", {}},
};
for (auto [input, expected] : cases) {
Ast::Endpoint actual;
auto f(begin(input)), l(end(input));
Grammar::Endpoint<It> grammar(actual);
if (parse(f, l, grammar >> qi::eoi)) {
std::cout << (actual == expected ? "PASS\t" : "FAIL\t") << quoted(input) //
<< " -> " << actual.host << " / " << actual.port << "\n";
} else {
std::cout << (expected == Ast::Endpoint{} ? "PASS\t" : "FAIL\t") << quoted(input)
<< " (not a valid endpoint)\n";
}
}
}
输出:
PASS "0foo" -> 0foo / 0
PASS "foo" -> foo / 0
PASS "foo.net" -> foo.net / 0
PASS "foo:1234" -> foo / 1234
PASS "foo.net:5678" -> foo.net / 5678
PASS "foo.example.net" -> foo.example.net / 0
PASS "foo.example.net:9012" -> foo.example.net / 9012
PASS "foo-bar" -> foo-bar / 0
PASS "foo-bar.com" -> foo-bar.com / 0
PASS "foo-bar:1234" -> foo-bar / 1234
PASS "foo-bar.net-0:5678" -> foo-bar.net-0 / 5678
PASS "foo-bar.example.com-1:9012" -> foo-bar.example.com-1 / 9012
PASS "foo.0bar" (not a valid endpoint)
PASS "foo.0bar:1234" (not a valid endpoint)
PASS "foo.bar-" (not a valid endpoint)
PASS "foo.bar-:1234" (not a valid endpoint)
PASS "foo-" (not a valid endpoint)
PASS "-foo" (not a valid endpoint)
不过,在看代码时我注意到了一些“多余的”(gratuitous)语义动作,它们同样不是原子的。可参考 Stack Overflow 上的这篇经典问答:Boost Spirit: "Semantic actions are evil"?
除非绝对必要,否则我强烈建议避免使用语义动作。去掉它们还能让解析器变成无状态的,这意味着您不必在每次解析时都重新构造并编译规则/表达式。
// Stateless variant: one Hostname grammar is built up front and reused; host
// and port are filled in through parse()'s attribute arguments instead of
// semantic actions.
Grammar::Hostname<It> hostname_;
for (auto const& [input, expected] : cases) {
    Ast::Endpoint ep;
    bool const matched = qi::parse(input.begin(), input.end(),
                                   -hostname_ >> -(':' >> qi::ushort_) >> qi::eoi,
                                   ep.host, ep.port);
    if (!matched) {
        // A default-constructed expectation marks an input that must be rejected.
        bool const rejectionExpected = (expected == Ast::Endpoint{});
        std::cout << (rejectionExpected ? "PASS\t" : "FAIL\t") << std::quoted(input)
                  << " (not a valid endpoint)\n";
        continue;
    }
    bool const asExpected = (ep == expected);
    std::cout << (asExpected ? "PASS\t" : "FAIL\t") << std::quoted(input)
              << " -> " << ep.host << " / " << ep.port << "\n";
}
与以前相同的输出,但代码更小,编译时间更短。
如果希望直接传入应用程序自己的 AST 类型,而不是分别绑定主机字符串和端口这两个属性,可以使用 std::pair,或者(用 Boost.Fusion)适配您的自定义类型:
#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <iostream>
namespace qi = boost::spirit::qi;
namespace Ast {
    /// Parse result: the host text and the numeric port of an endpoint.
    struct Endpoint {
        std::string    host{};   ///< Empty when no host was parsed.
        unsigned short port{0};  ///< 0 when no port was parsed.

        /// Member-wise comparison; the test driver relies on the resulting ==.
        auto operator<=>(const Endpoint&) const = default;
    };
} // namespace Ast
BOOST_FUSION_ADAPT_STRUCT(Ast::Endpoint, host, port)
namespace Grammar {
/// Qi grammar matching a host name; the exposed std::string is the raw
/// matched input, so none of the helper rules carries an attribute.
template <typename It>
struct Hostname : qi::grammar<It, std::string()> {
    Hostname() : Hostname::base_type(hostName) {
        using qi::alnum;
        using qi::alpha;
        using qi::raw;

        // Top level: prefer the dotted form and capture the matched text as-is.
        hostName = raw[dottedName | singleName];
        // One or more "label." groups followed by the final part.
        dottedName = +(singleName >> '.') >> tld;
        // Alphanumeric runs separated by single dashes.
        singleName = (+alnum) % '-';
        // Starts with a letter; afterwards alphanumerics, each optionally
        // preceded by a dash.
        tld = alpha >> *(('-' >> alnum) | alnum);
    }

  private:
    qi::rule<It, std::string()> hostName;
    qi::rule<It> dottedName;
    qi::rule<It> singleName;
    qi::rule<It> tld;
};
/// Stateless Qi grammar for "[host][:port]" that synthesizes an Ast::Endpoint.
///
/// There are no semantic actions: the optional host and the optional port
/// propagate automatically into the Fusion-adapted Ast::Endpoint (host first,
/// then port). A part that is absent leaves the corresponding member of the
/// attribute untouched.
template <typename It>
struct Endpoint : qi::grammar<It, Ast::Endpoint()> {
    Endpoint() : Endpoint::base_type(endpoint) {
        endpoint = -hostname >> -(':' >> qi::ushort_);
    }

  private:
    Hostname<It> hostname;
    // Only the start rule is needed; the previously declared `address` rule
    // was never assigned or referenced and has been dropped.
    qi::rule<It, Ast::Endpoint()> endpoint;
};
}
int main() {
using It = std::string_view::const_iterator;
struct {
std::string_view input;
Ast::Endpoint expected;
} cases[] = {
//// these should parse successfully (and do)
{"0foo", {"0foo", 0}},
{"foo", {"foo", 0}},
{"foo.net", {"foo.net", 0}},
{"foo:1234", {"foo", 1234}},
{"foo.net:5678", {"foo.net", 5678}},
{"foo.example.net", {"foo.example.net", 0}},
{"foo.example.net:9012", {"foo.example.net", 9012}},
{"foo-bar", {"foo-bar", 0}},
{"foo-bar.com", {"foo-bar.com", 0}},
{"foo-bar:1234", {"foo-bar", 1234}},
{"foo-bar.net-0:5678", {"foo-bar.net-0", 5678}},
{"foo-bar.example.com-1:9012", {"foo-bar.example.com-1", 9012}},
// these should fail to parse (and do)
{"foo.0bar", {}},
{"foo.0bar:1234", {}},
{"foo.bar-", {}},
{"foo.bar-:1234", {}},
{"foo-", {}},
{"-foo", {}},
};
static Grammar::Endpoint<It> const grammar;
for (auto [input, expected] : cases) {
Ast::Endpoint ep;
if (parse(begin(input), end(input), grammar >> qi::eoi, ep)) {
std::cout << (ep == expected ? "PASS\t" : "FAIL\t") << quoted(input) //
<< " -> " << ep.host << " / " << ep.port << "\n";
} else {
std::cout << (expected == Ast::Endpoint{} ? "PASS\t" : "FAIL\t") << quoted(input)
<< " (not a valid endpoint)\n";
}
}
}
输出仍与之前完全相同。请注意,现在的语法是无状态的。