除了标准库自带的分词器(例如 StreamTokenizer 和 StringTokenizer,在我看来它们并不是很好用)之外,我想知道 Java 是否还有其他好用的分词器。
我唯一知道的是jTopas,非常棒。那么,还有什么?
***让我们从建议中排除 ANTLR。另外,我知道从头开始编写一个相当容易,但这样做了几次后,我发现我更喜欢使用通用的东西。
我真的很喜欢ANTLR,尽管如果您只需要执行标记化,它可能会相当重量级 - 但如果您需要词法分析器/解析器,尤其是在 Java 中,它就很棒。
虽然jdk里已经包含了,但是你看过java.util.Scanner吗?我前段时间也用过JFlex,发现还不错。
在此给出一个与标记器和解析器相关的示例:
标记(Token):
public class Token {
public enum Type {
LOAD, FROM, SAVE, TO, TERMINATOR, PARAMETER;
}
private final Type type;
private final String value;
public Token(Type type, String value) {
this.value = value;
this.type = type;
}
public Type getType() {
return type;
}
public String getValue() {
return value;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o instanceof Token) {
Token t = (Token) o;
if (this.type == t.type && this.value.equals(t.value)) {
return true;
}
}
return false;
}
}
标记器:
/**
 * Splits a text into {@link Token}s, one token at a time.
 *
 * <p>The first token is extracted by the constructor. Afterwards {@link #current()}
 * returns the most recently extracted token and {@link #next()} advances to the
 * following one. When the text is exhausted, {@link #current()} returns {@code null}
 * and {@link #hasNext()} returns {@code false}.
 */
public class Tokeniser {
    private String buffer; // text that has not been tokenised yet
    private Token currentToken; // token extracted by the last next() call, or null when exhausted

    /**
     * Creates a tokeniser over {@code text} and extracts the first token.
     *
     * @param text the input to tokenise
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public Tokeniser(String text) {
        buffer = text; // save input text (string)
        next(); // extracts the first token.
    }

    /**
     * Finds and extracts the next token from {@code buffer} and saves it to
     * {@code currentToken}; sets {@code currentToken} to {@code null} when nothing
     * but whitespace is left.
     *
     * <p>A token is either a single {@code ';'} (TERMINATOR) or a whole word, i.e. the
     * longest run of characters up to the next whitespace character or {@code ';'}.
     * A word is a keyword token only if the <em>entire</em> word equals LOAD, FROM,
     * SAVE or TO (ignoring case); every other word is a PARAMETER. Matching whole
     * words keeps parameters such as {@code total} or {@code loader.txt} intact
     * instead of splitting a keyword-looking prefix off them.
     */
    public void next() {
        buffer = buffer.trim(); // remove leading/trailing whitespace
        if (buffer.isEmpty()) {
            currentToken = null; // nothing left to tokenise
            return;
        }
        if (buffer.charAt(0) == ';') {
            currentToken = new Token(Token.Type.TERMINATOR, ";");
        } else {
            // After trim() the first character is never a delimiter, so the word is non-empty
            // and the buffer always shrinks below.
            int end = 1;
            while (end < buffer.length() && !isDelimiter(buffer.charAt(end))) {
                end++;
            }
            String word = buffer.substring(0, end);
            currentToken = new Token(classify(word), word);
        }
        // Remove the extracted token from buffer
        int tokenLen = currentToken.getValue().length();
        buffer = buffer.substring(tokenLen);
    }

    /**
     * Tells whether {@code c} ends a word. Whitespace is defined exactly as
     * {@link String#trim()} defines it (any character up to and including the space
     * character), so every delimiter that can follow a word is also stripped by the
     * {@code trim()} call in {@link #next()}.
     */
    private static boolean isDelimiter(char c) {
        return c == ';' || c <= ' ';
    }

    /** Maps a whole word to its keyword type, or to PARAMETER if it is not a keyword. */
    private static Token.Type classify(String word) {
        // equalsIgnoreCase is locale-independent, unlike toUpperCase() with the default locale.
        if (word.equalsIgnoreCase("LOAD")) {
            return Token.Type.LOAD;
        }
        if (word.equalsIgnoreCase("FROM")) {
            return Token.Type.FROM;
        }
        if (word.equalsIgnoreCase("SAVE")) {
            return Token.Type.SAVE;
        }
        if (word.equalsIgnoreCase("TO")) {
            return Token.Type.TO;
        }
        return Token.Type.PARAMETER;
    }

    /**
     * returned the current token extracted by {@code next()} **** please do not
     * modify this part ****
     *
     * @return type: Token
     */
    public Token current() {
        return currentToken;
    }

    /**
     * check whether there still exists another tokens in the buffer or not ****
     * please do not modify this part ****
     *
     * @return type: boolean
     */
    public boolean hasNext() {
        return currentToken != null;
    }
}
解析器:
import java.util.LinkedList;
import java.util.List;
/**
 * Turns the token stream of a {@link Tokeniser} into a list of commands.
 *
 * <p>A command is recognised by a leading LOAD or SAVE token followed by three more
 * tokens: a first operand, a separator and a second operand. Tokens of any other
 * type at command position are skipped.
 */
public class Parser {
    private final Tokeniser tokeniser;

    /**
     * @param tokeniser the token source; it is consumed by {@link #parseCmds()}
     */
    public Parser(Tokeniser tokeniser) {
        this.tokeniser = tokeniser;
    }

    /**
     * Consumes all remaining tokens and builds one command per LOAD/SAVE token found.
     *
     * @return the parsed commands, in input order
     * @throws IllegalArgumentException if the input ends before a LOAD or SAVE
     *         command has received its two operands
     */
    public List<Command> parseCmds() {
        List<Command> commands = new LinkedList<>();
        while (tokeniser.hasNext()) {
            Token token = tokeniser.current();
            Command command = null;
            switch (token.getType()) {
                case LOAD: {
                    String[] operands = readOperands("LOAD");
                    command = new LoadCommand(operands[0], operands[1]);
                    break;
                }
                case SAVE: {
                    String[] operands = readOperands("SAVE");
                    command = new SaveCommand(operands[0], operands[1]);
                    break;
                }
                default:
                    // Not the start of a command (e.g. a terminator): skip it.
                    break;
            }
            // Step past the last token handled in this iteration.
            tokeniser.next();
            if (command != null) {
                commands.add(command);
            }
        }
        return commands;
    }

    /**
     * Reads the three tokens that follow a command keyword and returns the values of
     * the first and third one. The token in between (the separator) is skipped without
     * checking its type. On return the tokeniser is positioned on the second operand.
     *
     * @param keyword the command keyword being parsed, used only in error messages
     * @return a two-element array: first operand value, second operand value
     */
    private String[] readOperands(String keyword) {
        tokeniser.next();
        String first = requireCurrent(keyword, "a first operand").getValue();
        tokeniser.next();
        requireCurrent(keyword, "a separator keyword");
        tokeniser.next();
        String second = requireCurrent(keyword, "a second operand").getValue();
        return new String[] {first, second};
    }

    /**
     * Returns the current token, failing with a descriptive error instead of a bare
     * {@code NullPointerException} when the input has already ended.
     */
    private Token requireCurrent(String keyword, String expected) {
        Token token = tokeniser.current();
        if (token == null) {
            throw new IllegalArgumentException(
                    "Incomplete " + keyword + " command: expected " + expected
                            + " but reached end of input");
        }
        return token;
    }
}