// Repository viewer residue (not part of the source): Rev 4399 | Blame | Compare with Previous | Last modification | View Log | RSS feed
/*
* Copyright (C) 2008,2014 Steve Ratcliffe
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
* Author: Steve Ratcliffe
* Create date: May 10, 2008
*/
package uk.me.parabola.mkgmap.scan;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
/**
 * Read a file in terms of word and symbol tokens.
 *
 * <p>Characters are pulled from a {@link Reader} and grouped into end of
 * line, white space, text (word) and symbol tokens. Tokens are held in a
 * queue so that the caller can peek at the next one, or push one back,
 * before consuming it.
 *
 * @author Steve Ratcliffe
 */
public class TokenScanner {
    /**
     * Marks the pushback slot as empty. It has to be a value that
     * {@link Reader#read()} can never return: characters are in the range
     * 0 to 0xFFFF and end of file is -1, so 0 is not safe to use here (a
     * pushed back NUL character would be lost).
     */
    private static final int NO_PUSHBACK = -2;

    // Only used in the message reported for an undecodable character.
    private String charset = "utf-8";

    // Reading state
    private final Reader reader;
    // A single character (or -1 for end of file) that was read too far.
    private int pushback = NO_PUSHBACK;
    private boolean isEOF;
    private final String fileName;
    private int linenumber;

    private final LinkedList<Token> tokens = new LinkedList<>();
    // True when the next token to be consumed starts a new line.
    private boolean bol = true;

    // Extra word characters.
    private String extraWordChars = "";
    private String commentChar = "#";

    /**
     * Create a scanner over the given reader.
     *
     * @param filename The name of the file being read, as returned by
     * {@link #getFileName()}.
     * @param reader The source of characters. It is wrapped in a
     * {@link BufferedReader} unless it already is one.
     */
    public TokenScanner(String filename, Reader reader) {
        if (reader instanceof BufferedReader) {
            this.reader = reader;
        } else {
            this.reader = new BufferedReader(reader);
        }
        fileName = filename;
    }

    /**
     * Set the name of the character set that the input is expected to be in.
     * Within this class the name is only used in the error message produced
     * when a bad character is found; it does not change how the input is decoded.
     *
     * @param charset The character set name to report.
     */
    public void setCharset(String charset) {
        this.charset = charset;
    }

    /**
     * Peek and return the first token. It is not consumed.
     */
    public Token peekToken() {
        ensureTok();
        return tokens.peek();
    }

    /**
     * Get and remove the next token. May return space or newline. This is the
     * only place that a token is removed from the tokens queue.
     */
    public Token nextRawToken() {
        ensureTok();

        // The line count goes up when the first token of a line is consumed.
        if (bol) {
            bol = false;
            linenumber++;
        }

        Token token = tokens.removeFirst();
        if (token.getType() == TokType.EOL) {
            bol = true;
        }
        return token;
    }

    /**
     * Get the next token that is not a space or newline.
     * @return The first valid text or symbol token.
     */
    public Token nextToken() {
        skipSpace();
        return nextRawToken();
    }

    /**
     * Push a token back to the beginning of the token queue.
     * @param tok The token to add to the beginning of the queue.
     */
    public void pushToken(Token tok) {
        tokens.push(tok);
    }

    /**
     * Get the value of the next non-space token and consume the token. You'd
     * probably only call this after having peeked the type earlier.
     * Any initial space is skipped.
     */
    public String nextValue() {
        skipSpace();
        return nextRawToken().getValue();
    }

    /**
     * Check whether the input is exhausted.
     *
     * @return True if the token at the head of the queue is the end of file token.
     */
    public boolean isEndOfFile() {
        ensureTok();
        if (tokens.isEmpty()) {
            return isEOF;
        }
        return tokens.peek().getType() == TokType.EOF;
    }

    /**
     * Skip any white space and comments. After calling this the next token
     * will be end of file or something that is neither white space nor the
     * start of a comment.
     */
    public void skipSpace() {
        while (!isEndOfFile()) {
            Token head = tokens.peek();
            if (head.isValue(commentChar)) {
                // A comment runs to the end of the line.
                skipLine();
                continue;
            }

            if (!head.isWhiteSpace()) {
                break;
            }
            nextRawToken();
        }
    }

    /**
     * Skip everything up to a new line token. The new line
     * token will be consumed, so the next token will be the first
     * on a new line (or at EOF).
     */
    public void skipLine() {
        while (!isEndOfFile()) {
            Token t = nextRawToken();
            if (t.getType() == TokType.EOL) {
                break;
            }
        }
    }

    /**
     * Make sure that there is at least one token in the queue, reading
     * one from the input if the queue is empty.
     */
    private void ensureTok() {
        if (tokens.isEmpty()) {
            fillTok();
        }
    }

    /**
     * Read one token from the input and add it to the end of the queue.
     */
    private void fillTok() {
        tokens.add(readTok());
    }

    /**
     * Read a token from the input stream. There are only a few
     * kinds of token that are recognised on input. Other token
     * types are recognised or constructed later on.
     * @return A token, never null. Once end of file or a read error occurs
     * the routine will always return EOF.
     */
    private Token readTok() {
        if (isEOF) {
            return new Token(TokType.EOF);
        }

        int c = readChar();
        if (c == -1) {
            isEOF = true;
            return new Token(TokType.EOF);
        }

        StringBuilder val = new StringBuilder();
        val.append((char) c);

        TokType tt;
        if (c == '\r') {
            // Treat CR LF as one end of line; a lone CR is also an end of line.
            c = readChar();
            if (c != '\n') {
                pushback = c;
            }
            tt = TokType.EOL;
        } else if (c == '\n') {
            tt = TokType.EOL;
        } else if (isSpace(c)) {
            // Gather a run of white space, stopping before any new line.
            while (isSpace(c = readChar()) && c != '\n') {
                val.append((char) c);
            }
            pushback = c;
            tt = TokType.SPACE;
        } else if (isWordChar(c)) {
            while (isWordChar(c = readChar())) {
                val.append((char) c);
            }
            pushback = c;
            tt = TokType.TEXT;
        } else {
            // A symbol. The value has already been set. Some symbols
            // combine from multiple characters.
            if (c == '!' || c == '<' || c == '>') {
                // Allow !=, <= and >= as single symbols
                c = readChar();
                if (c == '=') {
                    val.append('=');
                } else {
                    pushback = c;
                }
            } else if (c == '&' || c == '|') {
                // Allow && and || as single symbols
                int c2 = readChar();
                if (c2 == c) {
                    val.append((char) c2);
                } else {
                    pushback = c2;
                }
            }
            tt = TokType.SYMBOL;
        }

        Token t = new Token(tt);
        t.setValue(val.toString());
        return t;
    }

    /**
     * Read a single character, taking the pushed back character first if
     * there is one.
     *
     * @return The next character, or -1 if at EOF. If reading fails with an
     * IOException then the isEOF field is set to true and -1 is returned.
     * @throws SyntaxException If the replacement character (U+FFFD) is read
     * from the underlying reader.
     */
    private int readChar() {
        if (pushback != NO_PUSHBACK) {
            int c = pushback;
            pushback = NO_PUSHBACK;
            return c;
        }

        try {
            int c = reader.read();
            if (c == 0xfffd) {
                throw new SyntaxException(this, "Bad character in input, file probably not in " + charset);
            }
            return c;
        } catch (IOException ignored) {
            // Deliberate: a read error is treated the same as end of file so
            // that the token reading code never has to deal with an exception.
            isEOF = true;
            return -1;
        }
    }

    /**
     * @return True if the character is white space. A byte order mark
     * (U+FEFF) is also regarded as space.
     */
    private boolean isSpace(int nextch) {
        return Character.isWhitespace(nextch) || nextch == '\uFEFF';
    }

    /**
     * @return True if the character can be part of a word: a letter, a digit,
     * an underscore or one of the configured extra word characters.
     */
    private boolean isWordChar(int ch) {
        return Character.isLetterOrDigit(ch)
                || ch == '_'
                || extraWordChars.indexOf(ch) >= 0;
    }

    /**
     * Read the tokens up until the end of the line and combine them
     * into one string.
     *
     * @return A single string, not including the newline terminator. Never
     * returns null, returns an empty string if there is nothing there. The
     * end of line is consumed.
     */
    public String readLine() {
        String res = readUntil(TokType.EOL, null);
        nextRawToken(); // use up new line
        return res;
    }

    /**
     * Read tokens until one of the given type and value is found and return the result as a single string.
     * The searched token is not consumed from the input.
     *
     * @param type The token type to search for.
     * @param value The string value of the token to search for, or null to
     * stop at any token of the given type.
     * @return A single string of all the tokens preceding the searched token,
     * with leading and trailing white space trimmed.
     */
    public String readUntil(TokType type, String value) {
        StringBuilder sb = new StringBuilder();
        while (!isEndOfFile()) {
            Token t = peekToken();
            boolean valueMatches = value == null || value.equals(t.getValue());
            if (t.getType() == type && valueMatches) {
                break;
            }
            sb.append(nextRawToken().getValue());
        }
        return sb.toString().trim();
    }

    /**
     * Convenience routine to get an integer. Skips space and reads a
     * token. This token is converted to an integer if possible.
     * @return An integer as read from the next non space token.
     * @throws NumberFormatException When the next symbol isn't
     * a valid integer, or there is no value to convert.
     */
    public int nextInt() throws NumberFormatException {
        skipSpace();
        Token t = nextRawToken();
        // A token without a value has nothing to parse; report that clearly
        // rather than relying on the message produced for a null string.
        if (t == null || t.getValue() == null) {
            throw new NumberFormatException("no number");
        }
        return Integer.parseInt(t.getValue());
    }

    /**
     * As {@link #nextWordWithInfo()} but just the string is returned.
     * @return The next word as a string. A quoted entity is regarded as a
     * word for the purposes of this scanner.
     */
    public String nextWord() {
        return nextWordWithInfo().getText();
    }

    /**
     * Read a string that can be quoted. If it is quoted, then everything
     * until the closing quotes is part of the string. Both single
     * and double quotes can be used.
     *
     * If there are no quotes then it behaves like nextToken apart from
     * skipping space.
     *
     * Initial and final space is skipped.
     *
     * The word string is returned along with a flag to indicate whether it
     * was quoted or not.
     */
    public WordInfo nextWordWithInfo() {
        skipSpace();

        // An opening quote is consumed and remembered so that the matching
        // closing quote can be recognised.
        Token tok = peekToken();
        char quotec = 0;
        if (tok.getType() == TokType.SYMBOL) {
            String s = tok.getValue();
            if ("'".equals(s) || "\"".equals(s)) {
                quotec = s.charAt(0);
                nextRawToken();
            }
        }

        boolean quoted = quotec != 0;
        String closingQuote = String.valueOf(quotec);

        StringBuilder sb = new StringBuilder();
        while (!isEndOfFile()) {
            tok = nextRawToken();
            if (!quoted) {
                // Unquoted: the word is just the one token.
                sb.append(tok.getValue());
                break;
            }
            if (tok.isValue(closingQuote)) {
                break;
            }
            sb.append(tok.getValue());
        }

        skipSpace();
        return new WordInfo(sb.toString(), quoted);
    }

    /**
     * Check the value of the next non-space token without consuming it.
     *
     * Any white space will be consumed
     *
     * @param val String value to compare against.
     * @return True if the next token has the same value as the argument.
     * False if either the argument or the token value is null.
     */
    public boolean checkToken(String val) {
        skipSpace();
        Token tok = peekToken();
        if (val == null || tok.getValue() == null) {
            return false;
        }
        return val.equals(tok.getValue());
    }

    /**
     * Validate the next word is the given value. Space is skipped before
     * checking, the checked value is consumed. Use when you want to
     * ensure that a required syntax element is present.
     *
     * The input will either be positioned after the required word or an
     * exception will have been thrown.
     *
     * @param val The string value to look for.
     * @throws SyntaxException If the required string is not found.
     */
    public void validateNext(String val) {
        // nextToken() skips any leading space itself.
        Token tok = nextToken();
        if (val == null || !val.equals(tok.getValue())) {
            throw new SyntaxException(this, "Expecting " + val + ", instead saw " + tok.getValue());
        }
    }

    /**
     * @return The current line count. It is increased each time the first
     * token of a line is consumed, so it is zero until a token has been consumed.
     */
    public int getLinenumber() {
        return linenumber;
    }

    /**
     * @return The file name that was given when the scanner was created.
     */
    public String getFileName() {
        return fileName;
    }

    /**
     * Extra word characters are characters that should be considered as part of a word in addition
     * to alphanumerics and underscore.
     * @param extraWordChars A string containing all the characters to be considered part of a word.
     * Passing null is the same as passing an empty string.
     */
    public void setExtraWordChars(String extraWordChars) {
        // Never store null, the string is searched for every character read.
        if (extraWordChars == null) {
            this.extraWordChars = "";
        } else {
            this.extraWordChars = extraWordChars;
        }
    }

    /**
     * The skip space routine, will skip all characters after a '#' until the end of the
     * line as part of its skip white space functionality.
     *
     * This is a mis-feature if your comment character is not '#' or that character is
     * sometimes important. Therefore you can turn this off by passing in an empty string here.
     *
     * @param commentChar The token value that starts a comment. Null is
     * treated as an empty string.
     */
    public void setCommentChar(String commentChar) {
        if (commentChar == null) {
            this.commentChar = "";
        } else {
            this.commentChar = commentChar;
        }
    }
}