Subversion Repositories mkgmap

Rev

Rev 3408 | Blame | Compare with Previous | Last modification | View Log | RSS feed

/*
 * Copyright (C) 2010.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 or
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */


package uk.me.parabola.imgfmt.app.labelenc;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Locale;

import uk.me.parabola.log.Logger;

/**
 * A simple transliterator that transliterates character by character based
 * on pre-prepared tables.  It is not context sensitive - the same input character
 * always produces the same output character(s), so the results are
 * not very good for languages where that is important.
 *
 * Tables are only read when needed, so for a typical map only a small
 * number of files will actually be read.
 */

public class TableTransliterator implements Transliterator {
        private static final Logger log = Logger.getLogger(TableTransliterator.class);

        private final String[][] rows = new String[256][];
        private final boolean useLatin;
        private boolean forceUppercase;

        public TableTransliterator(String targetCharset) {
                if (targetCharset.equals("latin1") || targetCharset.equals("cp1252"))
                        useLatin = true;
                else
                        useLatin = false;
        }

        /**
         * Convert a string into a string that uses only ascii characters.
         *
         * @param s The original string.  It can use any unicode character. Can be null in which case null will
         * be returned.
         * @return A string that uses only ascii characters that is a transcription or
         *         transliteration of the original string.
         */

        public String transliterate(String s) {
                if (s == null)
                        return null;

                StringBuilder sb = new StringBuilder(s.length() + 5);
                for (char c : s.toCharArray()) {
                        if (c <= (useLatin? 0xff: 0x7f)) {
                                sb.append(c);
                        } else {
                                int row = c >>> 8;

                                String[] rowmap = rows[row];
                                if (rowmap == null)
                                        rowmap = loadRow(row);
                                sb.append(rowmap[c & 0xff]);
                        }
                }

                String text = sb.toString();
                if (forceUppercase)
                        text = text.toUpperCase(Locale.ENGLISH);
                return text;
        }

        public void forceUppercase(boolean uc) {
                forceUppercase = uc;
        }

        /**
         * Load one row of characters.  This means unicode characters that are of the
         * form U+RRXX where RR is the row.
         *
         * @param row Row number 0-255.
         * @return An array of strings, one for each character in the row.  If there is
         *         no ascii representation then a '?' character will fill that
         *         position.
         */

        private String[] loadRow(int row) {
                if (rows[row] != null)
                        return rows[row];

                String[] newRow = new String[256];
                rows[row] = newRow;

                // Default all to a question mark
                Arrays.fill(newRow, "?");

                // If we are doing latin1, see if there is a specific file for latin
                // characters first.
                if (useLatin) {
                        String name = String.format("/chars/latin1/row%02x.trans", row);
                        readCharFile(name, newRow);
                }

                // Fill in any remaining characters from the ascii mappings.
                String name = String.format("/chars/ascii/row%02x.trans", row);
                readCharFile(name, newRow);

                return newRow;
        }

        private void readCharFile(String name, String[] newRow) {
                InputStream is = getClass().getResourceAsStream(name);
                if (is == null)
                        return;

                try {
                        BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8"));

                        String line;
                        while ((line = br.readLine()) != null) {
                                line = line.trim();
                                if (line.isEmpty() || line.charAt(0) == '#')
                                        continue;

                                String[] fields = line.split("\\s+");
                                if (fields.length < 2)
                                        continue;
                               
                                String upoint = fields[0];
                                String translation = fields[1];

                                if ("?".equals(translation)) continue;
                                if (upoint.length() != 6 || upoint.charAt(0) != 'U') continue;

                                // The first field must look like 'U+RRXX', we extract the XX part
                                int index = Integer.parseInt(upoint.substring(4), 16);
                                if (newRow[index].equals("?")) {
                                        if (forceUppercase)
                                                newRow[index] = translation.toUpperCase(Locale.ENGLISH);
                                        else
                                                newRow[index] = translation;
                                }
                        }
                } catch (IOException e) {
                        log.error("Could not read character translation table");
                }
        }

}