Rev 4399 |
Blame |
Compare with Previous |
Last modification |
View Log
| RSS feed
/*
* Copyright (C) 2010.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 or
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
package uk.me.parabola.imgfmt.app.labelenc;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Locale;
import uk.me.parabola.log.Logger;
/**
* A simple transliterator that transliterates character by character based
* on pre-prepared tables. It is not context sensitive - the same input character
* always produces the same output character(s), so the results are
* not very good for languages where that is important.
*
* Tables are only read when needed, so for a typical map only a small
* number of files will actually be read.
*/
public class TableTransliterator
implements Transliterator
{
private static final Logger log =
Logger.
getLogger(TableTransliterator.
class);
private final String[][] rows =
new String[256][];
private final boolean useLatin
;
private boolean forceUppercase
;
public TableTransliterator
(String targetCharset
) {
if ("latin1".
equals(targetCharset
) ||
"cp1252".
equals(targetCharset
))
useLatin =
true;
else
useLatin =
false;
}
/**
* Convert a string into a string that uses only ascii characters.
*
* @param s The original string. It can use any unicode character. Can be null in which case null will
* be returned.
* @return A string that uses only ascii characters that is a transcription or
* transliteration of the original string.
*/
public String transliterate
(String s
) {
if (s ==
null)
return null;
StringBuilder sb =
new StringBuilder(s.
length() +
5);
for (char c : s.
toCharArray()) {
if (c
<=
(useLatin
? 0xff: 0x7f
)) {
sb.
append(c
);
} else {
int row = c
>>> 8;
String[] rowmap = rows
[row
];
if (rowmap ==
null)
rowmap = loadRow
(row
);
sb.
append(rowmap
[c
& 0xff
]);
}
}
String text = sb.
toString();
if (forceUppercase
)
text = text.
toUpperCase(Locale.
ENGLISH);
return text
;
}
public void forceUppercase
(boolean uc
) {
forceUppercase = uc
;
}
/**
* Load one row of characters. This means unicode characters that are of the
* form U+RRXX where RR is the row.
*
* @param row Row number 0-255.
* @return An array of strings, one for each character in the row. If there is
* no ascii representation then a '?' character will fill that
* position.
*/
private String[] loadRow
(int row
) {
if (rows
[row
] !=
null)
return rows
[row
];
String[] newRow =
new String[256];
rows
[row
] = newRow
;
// Default all to a question mark
Arrays.
fill(newRow,
"?");
// If we are doing latin1, see if there is a specific file for latin
// characters first.
if (useLatin
) {
String name =
String.
format("/chars/latin1/row%02x.trans", row
);
readCharFile
(name, newRow
);
}
// Fill in any remaining characters from the ascii mappings.
String name =
String.
format("/chars/ascii/row%02x.trans", row
);
readCharFile
(name, newRow
);
return newRow
;
}
private void readCharFile
(String name,
String[] newRow
) {
try (InputStream is = getClass
().
getResourceAsStream(name
)) {
if (is ==
null)
return;
BufferedReader br =
new BufferedReader(new InputStreamReader(is, StandardCharsets.
UTF_8));
String line
;
while ((line = br.
readLine()) !=
null) {
line = line.
trim();
if (line.
isEmpty() || line.
charAt(0) ==
'#')
continue;
String[] fields = line.
split("\\s+");
if (fields.
length < 2)
continue;
String upoint = fields
[0];
String translation = fields
[1];
if ("?".
equals(translation
)) continue;
if (upoint.
length() !=
6 || upoint.
charAt(0) !=
'U') continue;
// The first field must look like 'U+RRXX', we extract the XX part
int index =
Integer.
parseInt(upoint.
substring(4),
16);
if ("?".
equals(newRow
[index
])) {
if (forceUppercase
)
newRow
[index
] = translation.
toUpperCase(Locale.
ENGLISH);
else
newRow
[index
] = translation
;
}
}
} catch (IOException e
) {
log.
error("Could not read character translation table");
}
}
}