WebSVN - mkgmap - Rev 4432 - /trunk/src/uk/me/parabola/mkgmap/srt/SrtTextReader.java

/*
* Copyright (C) 2010, 2011.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 or
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

package uk.me.parabola.mkgmap.srt;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import uk.me.parabola.imgfmt.ExitException;
import uk.me.parabola.imgfmt.app.srt.CodePosition;
import uk.me.parabola.imgfmt.app.srt.SRTFile;
import uk.me.parabola.imgfmt.app.srt.Sort;
import uk.me.parabola.imgfmt.fs.ImgChannel;
import uk.me.parabola.imgfmt.sys.FileImgChannel;
import uk.me.parabola.mkgmap.scan.SyntaxException;
import uk.me.parabola.mkgmap.scan.TokType;
import uk.me.parabola.mkgmap.scan.Token;
import uk.me.parabola.mkgmap.scan.TokenScanner;

/**
* Read in a sort file from a text format.
*
* The file is in utf-8, regardless of the target codepage.
*
* The file should start with a codepage declaration, which determines the
* target codepage for the sort. This can be followed by a description which is
* added into the SRT file.
*
* The characters are listed in order arranged in a way that shows the strength of the
* difference between the characters. These are:
*
* Primary difference - different letters (eg a and b)
* Secondary difference - different accents (eg a and a-acute)
* Tertiary difference - different case (eg a and A)
*
* The sort order section begins with the word 'code'.
*
* Primary differences are represented by the less-than separator.
* Secondary differences are represented by the semi-colon separator.
* Tertiary differences are represented by the comma separator.
*
* Characters are represented in <emphasis>unicode (utf-8)</emphasis> (the whole file must be in utf-8).
* Or alternatively you can use a four hex-digit number. A few special punctuation characters must
* be written that way to prevent them being mistaken for separators.
*
* Example
* <pre>
* # This is a comment
* codepage 1252
* description "Example sort"
* code a, A; â Â
* < b, B
* # Last two lines could be written:
* # code a, A; â, Â < b, B
* </pre>
*
* @author Steve Ratcliffe
*/
public class SrtTextReader {

// States
private static final int IN_INITIAL = 0;
private static final int IN_CHARACTER = 1;
private static final int IN_EXPAND = 2;
private static final boolean EXPERIMENTAL = false;

// Data that is read in, the output of the reading operation
private final Sort sort = new Sort();

private CharsetEncoder encoder;

// Used during parsing.
private int pos1;
private int pos2;
private int pos3;
private int state;
private String cflags = "";
private Map<Integer, Integer> maxSec;
private Map<Integer, Integer> maxTert;
private List<CodePosition> expansions;

public SrtTextReader(Reader r) {
this("stream", r);
}

private SrtTextReader(String filename) throws IOException {
this(filename, new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8));
}

private SrtTextReader(String filename, Reader r) {
maxSec = new HashMap<>();
maxTert = new HashMap<>();
expansions = new ArrayList<>();
read(filename, r);
maxSec = null;
maxTert = null;
expansions = null;
}

/**
* Find and read in the default sort description for the given codepage.
*/
public static Sort sortForCodepage(int codepage) {
String name = "sort/cp" + codepage + ".txt";
try (InputStream is = Sort.class.getClassLoader().getResourceAsStream(name)) {
if (is == null) {
if (codepage == 1252)
throw new ExitException("No sort description for code-page 1252 available");

Sort defaultSort = SrtTextReader.sortForCodepage(1252);
defaultSort.setCodepage(codepage);
defaultSort.setDescription("Default sort");
return defaultSort;
}

InputStreamReader r = new InputStreamReader(is, StandardCharsets.UTF_8);
SrtTextReader sr = new SrtTextReader(r);
return sr.getSort();
} catch (IOException e) {
throw new ExitException("Exception " + e + " getting sortForCodepage " + codepage);
}
}

/**
* Read in a file and save the information in a form that can be used
* to compare strings.
* @param filename The name of the file, used for display purposes. It need
* not refer to a file that actually exists.
* @param r The opened file or other readable source.
* @throws SyntaxException If the format of the file is incorrect.
*/
public void read(String filename, Reader r) {
TokenScanner scanner = new TokenScanner(filename, r);
resetPos();
state = IN_INITIAL;
while (!scanner.isEndOfFile()) {
Token tok = scanner.nextToken();

// We deal with whole line comments here
if (tok.isValue("#")) {
scanner.skipLine();
continue;
}

switch (state) {
case IN_INITIAL:
initialState(scanner, tok);
break;
case IN_CHARACTER:
characterState(scanner, tok);
break;
case IN_EXPAND:
expandState(scanner, tok);
break;
}
}
sort.setExpansions(expansions);
sort.finish();
}

/**
* The initial state, looking for a variable to set or a command to change
* the state.
* @param scanner The scanner for more tokens.
* @param tok The first token to process.
*/
private void initialState(TokenScanner scanner, Token tok) {
String val = tok.getValue();
TokType type = tok.getType();
if (type == TokType.TEXT) {
switch (val) {
case "codepage":
int codepage = scanner.nextInt();
sort.setCodepage(codepage);
encoder = sort.getCharset().newEncoder();
break;
case "description":
sort.setDescription(scanner.nextWord());
break;
case "id1":
sort.setId1(scanner.nextInt());
break;
case "id2":
sort.setId2(scanner.nextInt());
break;
case "multi":
sort.setMulti(true);
break;

case "code": // The old name; use characters
case "characters":
if (encoder == null)
throw new SyntaxException(scanner, "Missing codepage declaration before code");
state = IN_CHARACTER;
scanner.skipSpace();
break;
case "expand":
state = IN_EXPAND;
scanner.skipSpace();
break;
default:
throw new SyntaxException(scanner, "Unrecognised command " + val);
}
}
}

/**
* Block consisting of characters and relations between them.
*
* The sort order is derived from this.
*
* @param scanner The scanner for more tokens.
* @param tok The current token to process.
*/
private void characterState(TokenScanner scanner, Token tok) {
String val = tok.getValue();
TokType type = tok.getType();
if (type == TokType.TEXT) {
switch (val) {
case "flags":
scanner.validateNext("=");
cflags = scanner.nextWord();
// TODO not yet
break;
case "pos": // Used to set the actual sort position value, not used any more
scanner.validateNext("=");
try {
int newPos = Integer.decode(scanner.nextWord());
if (newPos < pos1)
throw new SyntaxException(scanner, "cannot set primary position backwards, was " + pos1);
pos1 = newPos;
} catch (NumberFormatException e) {
throw new SyntaxException(scanner, "invalid integer for position");
}
break;
case "pos2": // Used to set the actual sort position value, not used any more
scanner.validateNext("=");
pos2 = Integer.decode(scanner.nextWord());
break;
case "pos3": // Used to set the actual sort position value, not used any more
scanner.validateNext("=");
pos3 = Integer.decode(scanner.nextWord());
break;
case "code": // the old name, use 'characters'
case "characters":
advancePos();
break;
case "expand":
//scanner.pushToken(tok);
state = IN_EXPAND;
break;
default:
addCharacter(scanner, val);
break;
}
} else if (type == TokType.SYMBOL) {
switch (val) {
case "=":
break;
case ",":
pos3++;
break;
case ";":
pos3 = 1;
pos2++;
break;
case "<":
advancePos();
break;
default:
addCharacter(scanner, val);
break;
}

}
}

/**
* Within an 'expand' command. The whole command is read before return, they can not span
* lines.
*
* @param tok The first token after the keyword.
*/
private void expandState(TokenScanner scanner, Token tok) {
String val = tok.getValue();

Code code = new Code(scanner, val).read();

String s = scanner.nextValue();
if (!"to".equals(s))
throw new SyntaxException(scanner, "Expected the word 'to' in expand command");

int secondary = 0;
int tertiary = 0;
int num = 0;
while (!scanner.isEndOfFile()) {
Token t = scanner.nextRawToken();
if (t.isEol())
break;
if (t.isWhiteSpace())
continue;

Code r = new Code(scanner, t.getValue()).read();

CodePosition cp = new CodePosition();
int b = r.getBval();
int primary = sort.getPrimary(b);
cp.setPrimary(primary);

// We do not want the character to sort fully equal to the expanded characters (or any other
// character so adjust the ordering at other strengths. May need further tweaks.
if (EXPERIMENTAL) {
secondary = sort.getSecondary(b);
tertiary = sort.getTertiary(b);
if (num++ == 0) {
Integer max = maxSec.get(primary);
secondary += max == null ? 0 : max;
if (charFlags(code) == 1) {
max = maxTert.get(primary);
tertiary += max == null ? 0 : max;
}
} else {
secondary = 1;
}
cp.setSecondary(secondary);
cp.setTertiary(tertiary);
} else {
num++;
secondary = sort.getSecondary(b);
cp.setSecondary(secondary + 7);

tertiary = sort.getTertiary(b);
cp.setTertiary(tertiary + 2);
}
expansions.add(cp);
}

int flags = charFlags(code) | (num-1) << 4;
sort.add(code.getBval(), expansions.size() - num + 1, 0, 0, flags);
state = IN_INITIAL;
}

/**
* Add a character to the sort table.
* @param scanner Input scanner, for line number information.
* @param val A single character string containing the character to be added. This will
* be either a single character which is the unicode representation of the character, or
* two or more characters which is the hex representation of the code point in the target codepage.
*/
private void addCharacter(TokenScanner scanner, String val) {
Code code = new Code(scanner, val).read();
setSortcode(code);
}

/**
* Set the sort code for the given 8-bit character.
* @param c the code instance
*/
private void setSortcode(Code c) {
int flags = charFlags(c);
if (cflags.contains("0"))
flags = 0;
sort.add(c.getBval(), pos1, pos2, pos3, flags);
this.cflags = "";

if (EXPERIMENTAL) {
Integer max = maxSec.get(pos1);
if (max == null)
max = pos2;
else
max = Math.max(pos2, max);
maxSec.put(pos1, max);
max = maxTert.get(pos1);
if (max == null)
max = pos3;
else
max = Math.max(pos3, max);
maxTert.put(pos3, max);
}
}

/**
* The flags that describe the kind of character. Known ones
* are letter and digit. There may be others.
* @param c The actual code instance
* @return The flags that apply to it.
*/
private int charFlags(Code c) {
int flags = 0;
if (c.getBval() <= 0xff) {
int ch = c.getCval();
if (ch == 'ª' || (Character.isLetter(ch) && (Character.getType(ch) & Character.MODIFIER_LETTER) == 0))
flags = 1;
if (Character.isDigit(ch))
flags = 2;
}
return flags;
}

/**
* Reset the position fields to their initial values.
*/
private void resetPos() {
pos1 = 0;
pos2 = 0;
pos3 = 0;
}

/**
* Advance the major position value, resetting the minor position variables.
*/
private void advancePos() {
if (pos1 == 0)
pos1 = 1;
else
pos1 += pos2;
pos2 = 1;
pos3 = 1;
}

public Sort getSort() {
return sort;
}

/**
* Read in a sort description text file and create a SRT from it.
* @param args First arg is the text input file, the second is the name of the output file. The defaults are
* in.txt and out.srt.
*/
public static void main(String[] args) throws IOException {
String infile = "in.txt";
if (args.length > 0)
infile = args[0];

String outfile = "out.srt";
if (args.length > 1)
outfile = args[1];
try {
Files.delete(Paths.get(outfile, ""));
} catch (Exception e) {
}
ImgChannel chan = new FileImgChannel(outfile, "rw");
SRTFile sf = new SRTFile(chan);

SrtTextReader tr = new SrtTextReader(infile);
Sort sort1 = tr.getSort();
sf.setSort(sort1);
sf.write();
sf.close();
chan.close();
}

/**
* Helper to represent a code read from the file.
*
* You can write it in unicode, or as a hex number.
* We work out what you wrote, and return both the code point in
* the codepage and the unicode character form of the letter.
*/
private class Code {
private final TokenScanner scanner;
private final String val;
private int cval;
private int bval;

public Code(TokenScanner scanner, String val) {
this.scanner = scanner;
this.val = val;
}

/**
* Get the character encoded in the code-page encoding.
*
* It will be one byte for the format-9 code pages cp1252 etc.
* @return A character encoded in the code-page.
*/
public int getBval() {
return bval;
}

/**
* Get the character in unicode.
*
* It will in general be a 2 byte value.
*
* @return The character expressed in unicode.
*/
public int getCval() {
return cval;
}

public Code read() {
try {
if (val.length() == 1) {
cval = val.charAt(0);
} else {
cval = Integer.parseInt(val, 16);
}

if (sort.isMulti()) {
bval = cval;
} else {
CharBuffer cbuf = CharBuffer.wrap(new char[] {(char) cval});
ByteBuffer out = encoder.encode(cbuf);
if (out.remaining() > 1)
throw new SyntaxException(scanner, "more than one character resulted from conversion of " + val);

bval = out.get() & 0xff;
}

} catch (NumberFormatException e) {
throw new SyntaxException(scanner, "Not a valid hex number " + val);
} catch (CharacterCodingException e) {
throw new SyntaxException(scanner, "Character not valid in character set '" + val + "'");
}
return this;
}

public String toString() {
return String.format("%x", cval);
}
}
}

Subversion Repositories mkgmap

(root)/trunk/src/uk/me/parabola/mkgmap/srt/SrtTextReader.java - Rev 4432