Rev 3294 |
Blame |
Compare with Previous |
Last modification |
View Log
| RSS feed
/*
* Copyright (C) 2014.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 or
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
package main;
import java.nio.charset.Charset;
import java.text.CollationKey;
import java.text.Collator;
import java.text.ParseException;
import java.text.RuleBasedCollator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import uk.me.parabola.imgfmt.app.srt.Sort;
import uk.me.parabola.imgfmt.app.srt.SortKey;
import uk.me.parabola.mkgmap.srt.SrtTextReader;
/**
* Test to compare sorting results and timings between sort keys and collator.
*
* Also have tested against java7 RuleBasedCollator and the ICU one.
*
* In general our implementation is fastest by a long way; key based sort 3 times faster, collation
* based sort even more so. The java collator does not result in the same sort as using sort keys.
*
* I also tried out the ICU collation with mixed results. Could not get the correct desired results with
* it. It was not faster than our implementation for a 1252 cp sort.
*/
public class SortTest
{
private static final int LIST_SIZE =
500000;
private Sort sort
;
private boolean time
;
private boolean fullOutput
;
private boolean quiet
;
private boolean unicode
;
private void test
() throws Exception {
sort = SrtTextReader.
sortForCodepage(unicode
? 65001:
1252);
//testPairs();
Charset charset = sort.
getCharset();
Random rand =
new Random(21909278L
);
List<String> list = createList
(rand, charset
);
if (time
) {
// Run a few times without output, to warm up
compareLists
(sortWithKeys
(list
), sortWithKeys
(list
));
compareLists
(sortWithCollator
(list
), sortWithCollator
(list
));
compareLists
(sortWithJavaKeys
(list
), sortWithJavaKeys
(list
));
compareLists
(sortWithJavaCollator
(list
), sortWithJavaCollator
(list
));
// re-create the list to make sure it wasn't too optimised to the data
list = createList
(rand, charset
);
}
System.
out.
println("Compare key sort and collator sort");
int n = compareLists
(sortWithKeys
(list
), sortWithCollator
(list
));
System.
out.
println("N errors " + n
);
if (!unicode
) {
System.
out.
println("Compare our sort with java sort");
n = compareLists
(sortWithKeys
(list
), sortWithJavaKeys
(list
));
System.
out.
println("N errors " + n
);
}
if (time
) {
System.
out.
println("Compare java keys with java collator");
n = compareLists
(sortWithJavaKeys
(list
), sortWithJavaCollator
(list
));
System.
out.
println("N errors " + n
);
}
}
private List<String> createList
(Random rand,
Charset charset
) {
List<String> list =
new ArrayList<>();
for (int n =
0; n
< LIST_SIZE
; n++
) {
int len = rand.
nextInt(6)+
1;
if (len
< 2)
len = rand.
nextInt(5) +
2;
if (unicode
) {
char[] c =
new char[len
];
for (int i =
0; i
< len
; i++
) {
int ch
;
do {
if (rand.
nextInt(10) > 6)
ch = rand.
nextInt(6 * 256);
else
ch = rand.
nextInt(256);
} while (reject
(rand, ch
));
c
[i
] =
(char) ch
;
}
list.
add(new String(c
));
} else {
byte[] b =
new byte[len
];
for (int i =
0; i
< len
; i++
) {
int ch
;
do {
ch = rand.
nextInt(256);
// reject unassigned. Also low chars most of the time
} while (reject
(rand, ch
));
b
[i
] =
(byte) ch
;
}
list.
add(new String(b, charset
));
}
}
list =
Collections.
unmodifiableList(list
);
return list
;
}
private int compareLists
(List<String> r1,
List<String> r2
) {
int count =
0;
for (int i =
0; i
< LIST_SIZE
; i++
) {
String s1 = r1.
get(i
);
String s2 = r2.
get(i
);
String mark =
"";
if (!s1.
equals(s2
)) {
mark =
"*";
count++
;
}
if (fullOutput ||
(!mark.
isEmpty() && !quiet
))
System.
out.
printf("%6d |%-10s |%-10s %s\n", i, s1, s2, mark
);
}
return count
;
}
private boolean reject
(Random rand,
int ch
) {
switch (ch
) {
case 0:
case ' ':
case '\n':
case '\r':
case 0x81:
case 0x8d:
case 0x8f:
case 0x90:
case 0x9d:
return true;
}
switch (Character.
getType(ch
)) {
case Character.
UNASSIGNED:
return true;
case Character.
CONTROL:
return true;
}
// Reject low characters most of the time
if (ch
< 0x20
&& rand.
nextInt(100) < 95)
return true;
if (ch
> 255 && rand.
nextInt(100) > 99)
return true;
return false;
}
private List<String> sortWithKeys
(List<String> list
) {
long start =
System.
currentTimeMillis();
List<SortKey<String>> keys =
new ArrayList<>();
for (String s : list
) {
SortKey<String> key = sort.
createSortKey(s, s
);
keys.
add(key
);
}
Collections.
sort(keys
);
long end =
System.
currentTimeMillis();
List<String> ret =
new ArrayList<>();
for (SortKey<String> key : keys
) {
ret.
add(key.
getObject());
}
System.
out.
println("time keys: " +
(end-start
) +
"ms");
return ret
;
}
private List<String> sortWithCollator
(List<String> list
) {
long start =
System.
currentTimeMillis();
List<String> ret =
new ArrayList<>(list
);
Collections.
sort(ret, sort.
getCollator());
System.
out.
println("time coll: " +
(System.
currentTimeMillis() - start
) +
"ms");
return ret
;
}
private List<String> sortWithJavaKeys
(List<String> list
) {
long start =
System.
currentTimeMillis();
List<CollationKey> keys =
new ArrayList<>();
Collator jcol
;
try {
jcol =
new RuleBasedCollator(getRules
(false));
} catch (ParseException e
) {
e.
printStackTrace();
return null;
}
for (String s : list
) {
CollationKey key = jcol.
getCollationKey(s
);
keys.
add(key
);
}
Collections.
sort(keys
);
long end =
System.
currentTimeMillis();
List<String> ret =
new ArrayList<>();
for (CollationKey key : keys
) {
ret.
add(key.
getSourceString());
}
System.
out.
println("time J keys: " +
(end - start
) +
"ms");
return ret
;
}
private List<String> sortWithJavaCollator
(List<String> list
) {
long start =
System.
currentTimeMillis();
List<String> out =
new ArrayList<>(list
);
Collator jcol
;
try {
jcol =
new RuleBasedCollator(getRules
(false));
jcol.
setStrength(Collator.
TERTIARY);
} catch (ParseException e
) {
e.
printStackTrace();
return null;
}
Collections.
sort(out, jcol
);
System.
out.
println("time J collator: " +
(System.
currentTimeMillis() - start
) +
"ms");
return out
;
}
private String getRules
(boolean forICU
) {
return "='\u0008'='\u000e'='\u000f'='\u0010'='\u0011'='\u0012'='\u0013'='\u0014'='\u0015'='\u0016'"
+
"='\u0017' ='\u0018' = '\u0019' ='\u001a' ='\u001b'= '\u001c' ='\u001d'= '\u001e'= '\u001f' "
+
"='\u007f' ='\u00ad'"
+
", '\u0001', '\u0002', '\u0003', '\u0004' ,'\u0005' ,'\u0006', '\u0007'"
+
"< '\u0009' < '\n' < '\u000b' < '\u000c' < '\r' < '\u0020','\u00a0'"
+
"< '_' < '-' < '–' < '—' < '\u002c' < '\u003b' < ':' < '!' < '¡' < '?' < '¿'"
+
"< '.' < '·' "
+
((forICU
)? "< \\' ":
"< ''' ")
+
"< '‘' < '’' < '‚' < '‹' < '›' < '“' < '”' < '„' < '«' < '»' "
+
" < '\"' "
+
"< '“' < '”' < '„' < '«'< '»' < '(' < ')' "
+
"< '[' < ']' < '{' < '}' < '§' < '¶' < '@' < '*' < '/' < '\\' < '&' < '#' < '%'"
+
"< '‰' < '†' < '‡' < '•' < '`' < '´' < '^' < '¯' < '¨' < '¸' < 'ˆ' < '°' < '©' < '®'"
+
"< '+' < '±' < '÷' < '×' < '\u003c' < '\u003d' < '>' < '¬' < '|' < '¦' < '~' ; '˜' < '¤'"
+
"< '¢' < '$' < '£' < '¥' < '€' < 0 < 1,¹ < 2,² < 3,³ < 4 < 5 < 6 < 7 < 8 < 9"
+
"< a,ª,A ; á,Á ; à,À ; â, ; å,Å ; ä,Ä ; ã,Ã"
+
"< b,B"
+
"< c,C ; ç,Ç"
+
"< d,D ; ð,Ð"
+
"< e,E ; é,É ; è,È ; ê,Ê ; ë,Ë"
+
"< f,F"
+
"< ƒ"
+
"< g,G"
+
"< h,H"
+
"< i,I ; í,Í ; ì,Ì ; î,Î ; ï,Ï"
+
"< j,J"
+
"< k,K"
+
"< l,L"
+
"< m,M"
+
"< n,N ; ñ,Ñ"
+
"< o,º,O ; ó,Ó ; ò,Ò ; ô,Ô ; ö,Ö ; õ,Õ ; ø,Ø"
+
"< p,P"
+
"< q,Q"
+
"< r,R"
+
"< s,S ; š,Š"
+
"< t,T"
+
"< u,U ; ú,Ú ; ù,Ù ; û,Û ; ü,Ü"
+
"< v,V"
+
"< w,W"
+
"< x,X"
+
"< y,Y ; ý,Ý ; ÿ,Ÿ"
+
"< z,Z ; ž,Ž"
+
"< þ,Þ"
+
"< µ"
+
"&'1/4'=¼ &'1/2'=½ &'3/4'=¾"
+
"&ae = æ &AE = Æ &ss = ß &OE= Œ &oe= œ &TM = ™ &'...' = … "
;
}
public static void main
(String[] args
) throws Exception {
SortTest sortTest =
new SortTest
();
for (String arg : args
) {
switch (arg
) {
case "--time":
sortTest.
time =
true;
break;
case "--full":
sortTest.
fullOutput =
true;
break;
case "--quiet":
sortTest.
quiet =
true;
break;
case "--unicode":
sortTest.
unicode =
true;
break;
}
}
sortTest.
test();
}
}