#!/usr/bin/tclsh # gen_unicode_test.tcl -- # # This program parses the RFC 3454 file and generates the # corresponding unicode.test file with unicode package tests. # The input to this program should be NormalizationTest.txt. # It can be downloaded from: # ftp://ftp.unicode.org/Public/UNIDATA/NormalizationTest.txt # Short test suite is generated by default. If you want to generate # all tests (more than 300000 test cases) add suffix 'full' as the # third argument. # # Usage: gen_unicode_test.tcl infile outdir ?full? # # RCS: @(#) $Id: gen_unicode_test.tcl,v 1.1 2008/01/29 02:18:10 patthoyts Exp $ package require struct::list set short_test_list [list \ "LATIN CAPITAL LETTER D, COMBINING DOT ABOVE, COMBINING DOT BELOW" \ "NO-BREAK SPACE" \ "VULGAR FRACTION ONE HALF" \ "ORIYA LETTER RRA" \ "KANNADA VOWEL SIGN EE" \ "TIBETAN LETTER GHA" \ "MODIFIER LETTER CAPITAL A" \ "GREEK SMALL LETTER EPSILON WITH PSILI AND OXIA" \ "KANGXI RADICAL SPROUT" \ "HIRAGANA LETTER DE" \ "KATAKANA LETTER PA" \ "HANGUL LETTER SIOS-PIEUP" \ "HANGUL SYLLABLE GYANG" \ "CJK COMPATIBILITY IDEOGRAPH-F98E" \ "ARABIC LETTER HEH DOACHASHMEE ISOLATED FORM" \ "ARABIC LIGATURE AIN WITH JEEM ISOLATED FORM" \ "FULLWIDTH DIGIT THREE" \ "LATIN SMALL LETTER A, COMBINING CYRILLIC TITLO, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, LATIN SMALL LETTER B" \ "LATIN SMALL LETTER A, DEVANAGARI SIGN NUKTA, COMBINING TILDE OVERLAY, COMBINING RING OVERLAY, LATIN SMALL LETTER B" \ "HANGUL SYLLABLE BYO, COMBINING TILDE OVERLAY, HANGUL JONGSEONG TIKEUT"] set fd [open [lindex $argv 0]] set all_tests {} set n 0 while {[gets $fd line] >= 0} { set line [string trim $line] if {![regexp \ {^([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);.*\) (.*)} \ $line -> c(1) c(2) c(3) c(4) c(5) title]} continue set q 1 foreach i {1 2 3 4 5} { set s($i) {} set us($i) "" foreach xnum $c($i) { set uc [scan $xnum %x] if {$uc > 0xffff} { set q 0 } lappend s($i) $uc append us($i) \\u$xnum } } if {!$q} { # Test case contains character which is greater than 0xFFFF and can't # be represented in Tcl continue } set test($n) [list $s(1) $s(2) $s(3) $s(4) $s(5) $title] set test1($n) [list $us(1) $us(2) $us(3) $us(4) $us(5) $title] if {[lsearch $short_test_list $title] >= 0} { lappend all_tests $n } incr n } close $fd if {[string equal [lindex $argv 2] full]} { set all_tests [struct::list iota $n] } set f [open [file join [lindex $argv 1] unicode.test] w] fconfigure $f -translation lf puts $f \ "# unicode.test # # Tests for the unicode package. This file is automatically generated by # the gen_unicode_test.tcl script. Do not modify this file by hands. # # RCS: @(#) \$Id\$ # ------------------------------------------------------------------------- source \[file join \\ \[file dirname \[file dirname \[file join \[pwd\] \[info script\]\]\]\] \\ devtools testutilities.tcl\] testsNeedTcl 8.3 testsNeedTcltest 1.0 testing { useLocalFile unicode_data.tcl useLocalFile unicode.tcl } # ------------------------------------------------------------------------- " set j 0 foreach i $all_tests { puts $f \ " test unicode-1.[incr j] {normalizeS D: [lindex $test1($i) 5]} { unicode::normalizeS D \"[lindex $test1($i) 0]\" } \"[lindex $test1($i) 2]\" test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} { unicode::normalize D [list [lindex $test($i) 1]] } {[lindex $test($i) 2]} test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} { unicode::normalize D [list [lindex $test($i) 2]] } {[lindex $test($i) 2]} test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} { unicode::normalize D [list [lindex $test($i) 3]] } {[lindex $test($i) 4]} test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} { unicode::normalize D [list [lindex $test($i) 4]] } {[lindex $test($i) 4]} " } set j 0 foreach i $all_tests { puts $f \ " test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} { unicode::normalize C [list [lindex $test($i) 0]] } {[lindex $test($i) 1]} test unicode-2.[incr j] {normalizeS C: [lindex $test1($i) 5]} { unicode::normalizeS C \"[lindex $test1($i) 1]\" } \"[lindex $test1($i) 1]\" test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} { unicode::normalize C [list [lindex $test($i) 2]] } {[lindex $test($i) 1]} test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} { unicode::normalize C [list [lindex $test($i) 3]] } {[lindex $test($i) 3]} test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} { unicode::normalize C [list [lindex $test($i) 4]] } {[lindex $test($i) 3]} " } set j 0 foreach i $all_tests { puts $f \ " test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} { unicode::normalize KD [list [lindex $test($i) 0]] } {[lindex $test($i) 4]} test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} { unicode::normalize KD [list [lindex $test($i) 1]] } {[lindex $test($i) 4]} test unicode-3.[incr j] {normalizeS KD: [lindex $test1($i) 5]} { unicode::normalizeS KD \"[lindex $test1($i) 2]\" } \"[lindex $test1($i) 4]\" test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} { unicode::normalize KD [list [lindex $test($i) 3]] } {[lindex $test($i) 4]} test unicode-1.[incr j] {normalize KD: [lindex $test($i) 5]} { unicode::normalize KD [list [lindex $test($i) 4]] } {[lindex $test($i) 4]} " } set j 0 foreach i $all_tests { puts $f \ " test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} { unicode::normalize KC [list [lindex $test($i) 0]] } {[lindex $test($i) 3]} test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} { unicode::normalize KC [list [lindex $test($i) 1]] } {[lindex $test($i) 3]} test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} { unicode::normalize KC [list [lindex $test($i) 2]] } {[lindex $test($i) 3]} test unicode-4.[incr j] {normalizeS KC: [lindex $test1($i) 5]} { unicode::normalizeS KC \"[lindex $test1($i) 3]\" } \"[lindex $test1($i) 3]\" test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} { unicode::normalize KC [list [lindex $test($i) 4]] } {[lindex $test($i) 3]} " } puts $f \ " test unicode-5.1 {fromstring} { unicode::fromstring \"\\u0403\\u0405\\u0406\\u041f\\u0034\" } {1027 1029 1030 1055 52} test unicode-5.2 {fromstring} { unicode::fromstring \"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\u0008\\u0009\\u000a\\u000b\\u000c\\u000d\" } {1 2 3 4 5 6 7 8 9 10 11 12 13} test unicode-6.1 {tostring} { unicode::tostring {16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1} } \"\\u0010\\u000f\\u000e\\u000d\\u000c\\u000b\\u000a\\u0009\\u0008\\u0007\\u0006\\u0005\\u0004\\u0003\\u0002\\u0001\" test unicode-6.2 {tostring} { unicode::tostring {12345 12346 12347 12348 12349 12350 12351} } \"\\u3039\\u303a\\u303b\\u303c\\u303d\\u303e\\u303f\" test unicode-7.1 {normalize bad form} { catch {unicode::normalize S \"\"} result set result } \"::unicode::normalize: Only D, C, KD and KC forms are allowed\" test unicode-8.1 {normalizeS bad form} { catch {unicode::normalizeS S \"\"} result set result } \"::unicode::normalizeS: Only D, C, KD and KC forms are allowed\" ::tcltest::cleanupTests " close $f