WebSVN - display - Rev 583 - /trunk/src/test/display/SrtDisplay.java

/*
* Copyright (C) 2007 Steve Ratcliffe
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
* Author: Steve Ratcliffe
* Create date: Dec 16, 2007
*/
package test.display;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Formatter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeSet;

import uk.me.parabola.imgfmt.app.srt.Sort;

/**
* Standalone program to display the SRT file. This is used for
* the sorting order for different charsets apparently.
*
* @author Steve Ratcliffe
*/
@SuppressWarnings("MalformedFormatString") // bug, not actually an error for %c to have int
public class SrtDisplay extends CommonDisplay {
// Sections located by printSrt1() from the SRT 1 pointer area.
private Section description;
private Section tableHeader;

// File position of the SRT 1 pointer area, read in readFileHeader().
private int srt1start;

// Sub-sections read in tableHeader(). srt7 and srt8 are only assigned when the
// sub header is long enough, so they may remain null.
private Section characterTable;
private Section srt5;
private Section srt7;
private Section srt8;
// Decoder for the charset of the file's codepage; created in tableHeader().
private CharsetDecoder decoder;

// Entries of the srt5 expansion table, in the order printSrt5() read them.
private final List<CharPosition> expansions = new ArrayList<>();
// Every position read from the character table and from srt8, ordered by CharPosition.compareTo().
private final NavigableSet<CharPosition> charmap = new TreeSet<>();

// Set in tableHeader() when the codepage is 65001; toUnicode() then returns values unchanged.
private boolean isUnicode;

// Filled in by printSrt7(): maps (srt7 pointer / srt8 record size) to a block number.
private final Map<Integer, Integer> offsetToBlock = new HashMap<>();

// Header values remembered for the summary produced by printCharMap().
private String srtDescription;
private int codepage;
private int id1;
private int id2;

/**
* Display the file: first the common header, then the SRT specific file
* header and finally all of the sections in the body.
*/
protected void print() {
readCommonHeader();
readFileHeader();
printBody();
}

/**
* This header is unusual as it doesn't follow the normal header conventions for
* defining the sections. It points to a single section that contains a header
* which in turn points to other sections.
*
* The start of that single section is remembered in srt1start; everything else
* is only displayed.
*/
private void readFileHeader() {
Displayer d = new Displayer(reader);
d.setTitle("SRT Header");

d.charValue("??? %d"); // Number of sections?? Seems to be true, I have only one section, value here 0x01
srt1start = d.intValue("SRT 1 start");
d.charValue("len %d");

// A header length of 37 carries a second (unknown/start/len) group.
if (getHeaderLen() == 37) {
d.charValue("??? %d");
d.intValue("SRT 2 start %x");
d.charValue("len %d");
}
d.print(outStream);
}

/**
* Print all the sections of the file followed by a summary of the sort order.
*
* The order of the calls matters: the section pointers have to be read before
* the sections themselves, srt5 is read before the character table because
* printing a character table entry looks up the expansions list, and srt7 is
* read before srt8 because it fills in the block numbers used there.
*/
private void printBody() {
printSrt1();
//printSrt2();
printDescription();
tableHeader();

printSrt5();
printCharacterTable();
printSrt7();
printSrt8();

// Show the actual sort order
printCharMap();
}

/**
* This section has regular section headers like the other app files.
*
* It is located at srt1start and yields the description and table header
* sections, which are remembered for the later print methods.
*/
private void printSrt1() {
Displayer d = new Displayer(reader);
reader.position(srt1start);

d.setTitle("SRT 1 (pointers)");
description = readSection(d, "Description", 2, false, false);
tableHeader = readSection(d, "Table header", 3, false, false);

d.print(outStream);
}

/**
 * Show the description section. The description string is remembered in
 * srtDescription and whatever follows it in the section is shown as raw bytes.
 */
private void printDescription() {
	Displayer d = new Displayer(reader);
	reader.position(description.getStart());

	d.setTitle("Description");

	srtDescription = d.zstringValue("Description: %s");

	// The string plus one more byte (presumably its terminating zero) has been used up.
	int consumed = srtDescription.length() + 1;
	long unread = description.getLen() - consumed;
	d.rawValue((int) unread);

	d.print(outStream);
}

/**
* This is a strange section, because it also has a header with multiple sections.
*
* The sub header fields are read in file order. id1, id2 and the codepage are
* remembered and the charset decoder for the codepage is created. The character
* table and expansion sections are always read; the later sections are only read
* when the sub header length shows that they are present, so srt7 and srt8 can
* remain unset.
*/
private void tableHeader() {
Displayer d = new Displayer(reader);
d.setTitle("Character table header");

long start = tableHeader.getStart();
d.setSectStart(start);
reader.position(start);
int len = d.charValue("sub header len %d");
id1 = d.charValue("id1 %d");
id2 = d.charValue("id2 %d");

codepage = d.charValue("codepage %d");
// Codepage 65001 marks the file as unicode; toUnicode() then passes values through unchanged.
if (codepage == 65001)
isUnicode = true;
Charset charset = Sort.charsetFromCodepage(codepage);
decoder = charset.newDecoder();
d.intValue("??? %d");

characterTable = readSection(d, "SRT 4 character table", 4, true, false);
d.rawValue(6, "padding?");

srt5 = readSection(d, "SRT 5 expansions", 5, true, false);
d.rawValue(6, "padding?");

// Displayed only, the section is not used anywhere else.
if (len > 0x2c) {
readSection(d, "SRT 6 ??", 6, false, false);
}

// For multi byte character encodings.
if (len > 0x34) {
d.intValue("??");
d.intValue("max code block (in srt7) %d");
srt7 = readSection(d, "SRT 7 srt8 ptrs", 7, true, false);
// Two more fields of unknown meaning follow the srt7 pointer.
d.charValue("");
d.intValue("");
}
if (len > 0x44) {
srt8 = readSection(d, "SRT 8 multi-byte chars", 8, true, false);
}
// Whatever is left of the sub header (by its stated length) is shown raw.
d.rawValue((int) (len - (reader.position() - start)), "remainder");
d.print(outStream);
}

/**
 * The main character table. Looking up a character in this table gives the
 * primary, secondary and tertiary sorting weights, plus flags that mark letters
 * and numbers. Entries may instead refer to the expansion table; those are the
 * letters or symbols that sort as two or more characters.
 *
 * Every entry read is also added to the charmap for the final summary.
 */
private void printCharacterTable() {
	Displayer d = new Displayer(reader);
	d.setTitle("SRT 4 Character table");

	int recordSize = characterTable.getRecordSize();
	long headerStart = tableHeader.getStart();
	d.setSectStart(headerStart);
	// The table position is relative to the start of the table header.
	reader.position(headerStart + characterTable.getStart());

	// Records are numbered from 1; the number is used as the character value.
	for (int charValue = 1; charValue <= characterTable.getNumberOfRecords(); charValue++) {
		charmap.add(printCharPosition(d, recordSize, charValue));

		// Print record by record; the title is cleared after the first one.
		d.print(outStream);
		d.setTitle(null);
	}
}

/**
 * Now that we have all the characters read, print them out showing the sorting.
 *
 * The text imitates the mkgmap resource/sort/cp*.txt files: a header, then the
 * characters in sort order and finally one "expand" line for every character
 * that sorts as several characters.
 */
private void printCharMap() {
	Displayer d = new Displayer(reader);
	d.setTitle("------- Summary of ordering --------");

	Formatter chars = new Formatter();

	// reproduce header like mkgmap resource/sort/cp*.txt entries
	chars.format("\n\n\n");
	chars.format("# Compare this with resource/sort/cp%d.txt.\n\n", codepage);
	chars.format("codepage %d\n", codepage);
	chars.format("id1 %d\n", id1);
	chars.format("id2 %d\n", id2);
	chars.format("description \"%s\"\n\n", srtDescription);
	chars.format("characters\n\n");

	formatOrdering(chars);
	chars.format("\n");
	formatExpansions(chars);
	chars.format("\n# ends\n");

	d.item().addText(chars.toString());
	d.print(outStream);
}

/**
 * Write the non-expanding characters in sort order. The separator written before
 * a character shows the first weight that differs from the previous character:
 * a new line with '<' for the primary, ';' for the secondary, ',' for the tertiary
 * and '=' when all three are the same.
 *
 * @param chars The output is appended to this formatter.
 */
private void formatOrdering(Formatter chars) {
	// All weights start at zero, so the first line starts with zero/ignore sortOrder
	CharPosition last = new CharPosition(0);
	for (CharPosition cp : charmap) {
		if (cp.expands > 0)
			continue;
		int unicodeChar = toUnicode(cp.val);
		if (unicodeChar < 0) // no character defined for this position
			continue;

		if (cp.first != last.first) {
			chars.format("\n < ");
		} else if (cp.second != last.second) {
			chars.format(" ; ");
		} else if (cp.third != last.third) {
			chars.format(",");
		} else {
			chars.format("=");
		}
		last = cp;
		chars.format("%s", fmtChar(unicodeChar));
	}
}

/**
 * Write an "expand X to ..." line for every character that refers to the
 * expansion table.
 *
 * The entries of the expansion table are only read here, never modified, so an
 * entry that is referenced by several characters gives the same result each time.
 * A reference that falls outside the expansion table is reported in a trailing
 * comment on the line instead of throwing an exception.
 *
 * @param chars The output is appended to this formatter.
 */
private void formatExpansions(Formatter chars) {
	for (CharPosition cp : charmap) {
		if (cp.expands <= 0)
			continue;

		chars.format("expand %s to ", fmtChar(toUnicode(cp.val)));
		for (int i = 0; i <= cp.expands; ++i) {
			// The primary weight of an expanding character is a one based index into the expansions.
			int index = cp.first + i - 1;
			if (index < 0 || index >= expansions.size()) {
				chars.format(" # error: expansion index %d out of range", index);
				break;
			}
			CharPosition ch = expansions.get(index);

			// need to search for best char with this first/primary. Doesn't actually matter
			// apart from the cosmetics of the sort/cp*.txt expand list because the secondary
			// and tertiary binary sortOrders are chosen to avoid matching existing real chars.
			// see mkgmap/srt/SrtTextReader.java for more info
			int second = ch.second > 7 ? ch.second - 7 : ch.second;
			int third = ch.third >= 5 ? 2 : 1;

			int charValue = findExpansionChar(ch.first, second, third);
			if (charValue >= 0)
				charValue = toUnicode(charValue);
			if (charValue >= 0)
				chars.format(" %c", charValue);
		}
		chars.format("\n");
	}
}

/**
 * Find a non-expanding character to show for one element of an expansion.
 *
 * @param first The primary weight that the character must have.
 * @param second The preferred secondary weight.
 * @param third The preferred tertiary weight.
 * @return The value of the character that matches all three weights, else the
 * value of the first character in the charmap that has the primary weight,
 * or -1 when no character has it.
 */
private int findExpansionChar(int first, int second, int third) {
	int fallback = -1;
	for (CharPosition scanCp : charmap) {
		if (scanCp.expands > 0 || scanCp.first != first)
			continue;
		if (scanCp.second == second && scanCp.third == third)
			return scanCp.val;
		if (fallback < 0)
			fallback = scanCp.val;
	}
	return fallback;
}

/**
 * Format a character for the ordering summary.
 *
 * Characters that are used as separators in the summary ('<', ';', ',', '=')
 * and '#', as well as unassigned, non-spacing mark, format, control and
 * separator characters are written as at least four hex digits. Everything
 * else is written as the character itself.
 *
 * @param val The unicode value of the character.
 * @return The text to show for the character.
 */
private String fmtChar(int val) {
	if (val == '<' || val == ';' || val == ',' || val == '=' || val == '#')
		return String.format("%04x", val);

	int type = Character.getType(val);
	if (type == Character.UNASSIGNED
			|| type == Character.NON_SPACING_MARK
			|| type == Character.FORMAT
			|| type == Character.CONTROL
			|| type == Character.SPACE_SEPARATOR
			|| type == Character.LINE_SEPARATOR
			|| type == Character.PARAGRAPH_SEPARATOR) {
		return String.format("%04x", val);
	}

	//noinspection MalformedFormatString
	return String.format("%c", val);
}

/**
 * Read and describe one record of a character table: a flags byte followed by
 * the sort position.
 *
 * Bit 0 of the flags marks a letter and bit 1 a number. When the top four bits
 * are not zero the record is an expansion: those bits are stored as the expands
 * count and the primary weight is used as the position in the expansion table.
 *
 * @param d The displayer that supplies the item for this record.
 * @param reclen The full record length, including the flags byte.
 * @param charValue The character that this record applies to.
 * @return The position that was read, with expands set for an expansion.
 */
private CharPosition printCharPosition(Displayer d, int reclen, int charValue) {
	DisplayItem item = d.item();

	int flags = reader.get1u();
	item.setBytes1(flags);

	// The rest of the record, after the flags byte, is the sort position.
	CharPosition pos = readCharPosition(item, reclen - 1, charValue);

	StringBuilder text = new StringBuilder();
	text.append(String.format("0x%02x ", charValue));

	int unicodeChar = toUnicode(charValue);
	if (unicodeChar >= 0)
		text.append(String.format("(%c) ", unicodeChar));
	else
		text.append("NaC "); // no character defined for this position

	if ((flags & 0x1) != 0)
		text.append("Letter ");
	if ((flags & 0x2) != 0)
		text.append("Number ");

	int expandCount = (flags >> 4) & 0xf;
	if (expandCount != 0) {
		// This is an expansion, it sorts as two or more characters (eg ß sorts near ss).
		// The pos is an index into srt5.
		pos.expands = expandCount;
		expansion(text, pos.first, expandCount);
	} else {
		text.append(pos);
	}

	item.addText(text.toString());
	return pos;
}

/**
 * Read the sort position information. The layout depends on the posLength parameter:
 * with 2 bytes the weights take 8, 4 and 4 bits, with 3 bytes they take 8 bits each
 * and with 4 bytes they take 16, 8 and 8 bits, the primary weight always being in the
 * lowest bits. For any other length nothing is read and the weights are left at zero.
 *
 * @param item The display item - any bytes read are added to this.
 * @param posLength The length of the position information (not the record length, just the
 * part of it that encodes the positions).
 * @param charValue The character that this applies to.
 * @return A {@link CharPosition} structure containing the sort position weights.
 */
private CharPosition readCharPosition(DisplayItem item, int posLength, int charValue) {
	CharPosition pos = new CharPosition(charValue);

	int rec;
	switch (posLength) {
	case 2:
		rec = reader.get2u();
		item.setBytes2(rec);
		pos.first = rec & 0xff;
		pos.second = (rec >> 8) & 0xf;
		pos.third = (rec >> 12) & 0xf;
		break;
	case 3:
		rec = reader.get3u();
		item.setBytes3(rec);
		pos.first = rec & 0xff;
		pos.second = (rec >> 8) & 0xff;
		pos.third = (rec >> 16) & 0xff;
		break;
	case 4:
		rec = reader.get4();
		item.setBytes4(rec);
		pos.first = rec & 0xffff;
		pos.second = (rec >> 16) & 0xff;
		pos.third = (rec >> 24) & 0xff;
		break;
	default:
		break;
	}
	return pos;
}

/**
 * Some characters sort as if they were two separate characters (eg ß sorts like 'ss').
 * Describes the n+1 expansion entries starting at the one based position pos,
 * separated by " & ". A position that would come before the first entry is
 * reported as an error in the buffer and ends the description.
 *
 * @param sb Sort order descriptions are added to this buffer.
 * @param pos Index into the expansions area.
 * @param n The number of characters in the expansion.
 */
private void expansion(StringBuilder sb, int pos, int n) {
	sb.append("Expansion: ");
	for (int i = 0; i <= n; i++) {
		if (i > 0)
			sb.append(" & ");

		// pos counts from one, the list from zero
		int index = pos + i - 1;
		if (index == -1) {
			sb.append(String.format("error: pos=%d n=%d, readpos=%x", pos, n,
					reader.position()));
			return;
		}
		sb.append(expansions.get(index));
	}
}

/**
 * Convert a character value from the file to unicode.
 *
 * For a unicode file the value is returned as it is. Otherwise the value is
 * decoded as a single byte using the decoder for the codepage of the file.
 * NOTE(review): only the low byte of c is decoded, so values above 0xff in a
 * non-unicode file lose their upper part - confirm this is intended.
 *
 * @param c The character value as used in the file.
 * @return The unicode character, or -1 if the byte cannot be decoded.
 */
private int toUnicode(int c) {
	if (isUnicode)
		return c;

	ByteBuffer single = ByteBuffer.wrap(new byte[] {(byte) c});
	try {
		CharBuffer decoded = decoder.decode(single);
		return decoded.charAt(0);
	} catch (CharacterCodingException e) {
		return -1;
	}
}

/**
 * Expansion table. Some characters sort as though they were two (or more) characters.
 * This table is a list of sort positions that are referred to from the main character
 * table. As such there is no particular pattern to the entries in the table.
 *
 * The entries are kept, in file order, in the expansions list.
 */
private void printSrt5() {
	Displayer d = new Displayer(reader);
	d.setTitle("SRT 5 (expansions)");

	// Here the whole record is the sort position, there is no flags byte.
	int recordSize = srt5.getRecordSize();
	reader.position(tableHeader.getStart() + srt5.getStart());

	for (int n = 0; n < srt5.getNumberOfRecords(); n++) {
		DisplayItem item = d.item();
		CharPosition entry = readCharPosition(item, recordSize, 0);
		item.addText(entry.toString());
		expansions.add(entry);
	}
	d.print(outStream);
}

/**
 * This is used for multi-byte character sets.
 *
 * It is a list of pointers into srt8.
 *
 * Some slots are filled with 0xffffffff so you
 * probably look up the high bytes in this table to get the block where you
 * look up the low byte or something similar.
 *
 * Each non-negative pointer is converted to an srt8 record index and remembered
 * together with its block number (the one based position in this table) for use
 * by printSrt8. Nothing is remembered when there is no srt8 section or its record
 * size is zero, as the pointer cannot then be converted to a record index.
 */
private void printSrt7() {
	if (srt7 == null)
		return;

	Displayer d = new Displayer(reader);
	d.setTitle("SRT 7 (pointers to srt8 indexed by top part of char)");

	reader.position(tableHeader.getStart() + srt7.getStart());

	// srt8 is only read for the longest sub headers, so it can be missing even though srt7 exists.
	int srt8RecordSize = (srt8 == null) ? 0 : srt8.getRecordSize();

	int block = 1;
	for (int i = 0; i < srt7.getNumberOfRecords(); i++) {

		DisplayItem item = d.intItem();
		int val = item.getValue();
		item.addText("%4x: Pointer to srt8 %x", block, val);

		// Unused slots hold 0xffffffff, which is negative as an int.
		if (val >= 0 && srt8RecordSize > 0)
			offsetToBlock.put(val / srt8RecordSize, block);

		d.print(outStream);
		d.setTitle(null);
		block++;
	}
}

/**
 * The character table for multi-byte characters. Does nothing when the file
 * has no srt8 section.
 *
 * The character value of a record is made from the current block number and the
 * record index modulo 256. The block changes whenever the record index was
 * registered by printSrt7. Every record read is added to the charmap.
 */
private void printSrt8() {
	if (srt8 == null)
		return;

	Displayer d = new Displayer(reader);
	d.setTitle("SRT 8 (character table for multibyte characters)");

	int recordSize = srt8.getRecordSize();
	reader.position(tableHeader.getStart() + srt8.getStart());
	d.setSectStart(reader.position());

	int currentBlock = 1;
	for (int index = 0; index < srt8.getNumberOfRecords(); index++) {
		Integer mapped = offsetToBlock.get(index);
		if (mapped != null)
			currentBlock = mapped;

		int charValue = currentBlock * 256 + (index % 256);
		charmap.add(printCharPosition(d, recordSize, charValue));

		d.print(outStream);
		d.setTitle(null);
	}
}

/**
 * Program entry point.
 *
 * @param args The first argument is the name of the file to display. Without
 * it a usage message is printed and the program exits with status 1.
 */
public static void main(String[] args) {
	if (args.length == 0) {
		System.err.println("Usage: srtdisplay <filename>");
		System.exit(1);
	}

	String filename = args[0];
	CommonDisplay display = new SrtDisplay();
	display.display(filename, "SRT");
}

private class CharPosition implements Comparable<CharPosition> {
private final int val;
private int first;
private int second;
private int third;
private int expands;

public CharPosition(int charValue) {
this.val = charValue;
}

public int compareTo(CharPosition c2) {
if (c2.first == first)
return compareSecond(c2);
else if (first < c2.first)
return -1;
else
return 1;
}

private int compareSecond(CharPosition c2) {
if (c2.second == second)
return compareThird(c2);
else if (second < c2.second)
return -1;
else
return 1;
}

private int compareThird(CharPosition c2) {
if (third == c2.third)
return Integer.compare(val, c2.val);
else if (third < c2.third)
return -1;
else
return 1;
}

public String toString() {
return "prim=" + first + ",sec=" + second + ",tert=" + third;
}
}
}

Subversion Repositories display

(root)/trunk/src/test/display/SrtDisplay.java - Rev 583