# unicode.tcl                                                   -*- tcl -*-
#
#	Implementation of RFC 3454 "Preparation of Internationalized Strings"
#
# Copyright (c) 2007 Sergei Golovan
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# RCS: @(#) $Id: unicode.tcl,v 1.1 2008/01/29 02:18:10 patthoyts Exp $

package require unicode::data 1.0

namespace eval ::unicode {
    # Hangul constants
    set SBase 0xac00
    set LBase 0x1100
    set VBase 0x1161
    set TBase 0x11a7
    set LCount 19
    set VCount 21
    set TCount 28
    set NCount [expr {$VCount * $TCount}]
    set SCount [expr {$LCount * $NCount}]
}

########################################################################
# ::unicode::fromstring converts string to list of integers

proc ::unicode::fromstring {str} {
    set uclist {}
    foreach char [split $str ""] {
	lappend uclist [scan $char %c]
    }
    return $uclist
}

########################################################################
# ::unicode::tostring converts list of integers to string

proc ::unicode::tostring {uclist} {
    set res ""
    foreach num $uclist {
	append res [format %c $num]
    }
    return $res
}

########################################################################
# ::unicode::normalize normalizes list of integers according to
# http://unicode.org/reports/tr15/
# form is to be D, C, KD, or KC

proc ::unicode::normalize {form uclist} {
    switch -- $form {
	D  { return [normalizeD  $uclist] }
	C  { return [normalizeC  $uclist] }
	KD { return [normalizeKD $uclist] }
	KC { return [normalizeKC $uclist] }
	default {
	    return -code error \
		   "::unicode::normalize: Only D, C, KD and KC forms are\
		    allowed"
	}
    }
}

########################################################################
# ::unicode::normalizeS normalizes string according to
# http://unicode.org/reports/tr15/
# form is to be D, C, KD, or KC

proc ::unicode::normalizeS {form str} {
    switch -- $form {
	D  { return [tostring [normalizeD  [fromstring $str]]] }
	C  { return [tostring [normalizeC  [fromstring $str]]] }
	KD { return [tostring [normalizeKD [fromstring $str]]] }
	KC { return [tostring [normalizeKC [fromstring $str]]] }
	default {
	    return -code error \
		   "::unicode::normalizeS: Only D, C, KD and KC forms are\
		    allowed"
	}
    }
}

########################################################################

proc ::unicode::normalizeD {uclist} {
    set res {}
    foreach uc $uclist {
	set res [concat $res [decomposeCanonical $uc]]
    }

    canonicalOrdering $res
}

proc ::unicode::normalizeC {uclist} {
    composeCanonical [normalizeD $uclist]
}

proc ::unicode::normalizeKD {uclist} {
    set res {}
    foreach uc $uclist {
	set res [concat $res [decomposeCompat $uc]]
    }

    canonicalOrdering $res
}

proc ::unicode::normalizeKC {uclist} {
    composeCanonical [normalizeKD $uclist]
}

########################################################################
# Adjacent characters with nonzero character class should go in
# order of increasing character class

proc ::unicode::canonicalOrdering {uclist} {
    set res {}
    set slist {}
    foreach uc $uclist {
	set cclass [data::GetUniCharCClass $uc]
	if {$cclass != 0} {
	    lappend slist [list $uc $cclass]
	} else {
	    foreach s [lsort -integer -index 1 $slist] {
		lappend res [lindex $s 0]
	    }
	    set slist {}
	    lappend res $uc
	}
    }
    foreach s [lsort -integer -index 1 $slist] {
	lappend res [lindex $s 0]
    }

    return $res
}

########################################################################

proc ::unicode::decomposeHangul {uc} {
    variable SBase
    variable LBase
    variable VBase
    variable TBase
    variable LCount
    variable VCount
    variable TCount
    variable NCount
    variable SCount

    # Hangul decomposition is algorithmic
    set SIndex [expr {$uc - $SBase}]
    if {$SIndex >= 0 && $SIndex < $SCount} {
        set res {}
        set L [expr {$LBase + $SIndex / $NCount}]
        set V [expr {$VBase + ($SIndex % $NCount) / $TCount}]
        set T [expr {$TBase + $SIndex % $TCount}]
	set res [list $L $V]
        if {$T != $TBase} {
	    lappend res $T
	}
        return $res
    }
    return -1
}

########################################################################

proc ::unicode::decomposeCanonical {uc} {
    # Try to decompose Hangul first
    set res [decomposeHangul $uc]
    if {$res >= 0} {
	return $res
    }

    # For others do a lookup in data tables
    set info [data::GetUniCharDecompInfo $uc]
    if {$info >= 0} {
	set res {}
	foreach c [data::GetDecompList $info] {
	    set res [concat $res [decomposeCanonical $c]]
	}
	return $res
    } else {
	return [list $uc]
    }
}

########################################################################

proc ::unicode::decomposeCompat {uc} {
    # Try to decompose Hangul first
    set res [decomposeHangul $uc]
    if {$res >= 0} {
	return $res
    }

    # For others do a lookup in data tables
    set info [data::GetUniCharDecompCompatInfo $uc]
    if {$info >= 0} {
	set res {}
	foreach c [data::GetDecompList $info] {
	    set res [concat $res [decomposeCompat $c]]
	}
	return $res
    } else {
	return [list $uc]
    }
}

########################################################################

proc ::unicode::composeTwo {uc1 uc2} {
    variable SBase
    variable LBase
    variable VBase
    variable TBase
    variable LCount
    variable VCount
    variable TCount
    variable NCount
    variable SCount

    # Hangul composition is algorithmic
    if {$uc1 >= $LBase && $uc1 < $LBase + $LCount && \
	$uc2 >= $VBase && $uc2 < $VBase + $VCount} {
	return [expr {$SBase + (($uc1 - $LBase) * $VCount + \
		      ($uc2 - $VBase)) * $TCount}]
    }

    if {$uc1 >= $SBase && $uc1 < $SBase + $SCount && \
	(($uc1 - $SBase) % $TCount) == 0 && \
	$uc2 >= $TBase && $uc2 < $TBase + $TCount} {
	return [expr {$uc1 + $uc2 - $TBase}]
    }

    # For others do a lookup in data tables
    set info1 [data::GetUniCharCompInfo $uc1]
    set res [data::GetCompFirst $uc2 $info1]
    if {$res != -1} {
	return $res
    }

    set info2 [data::GetUniCharCompInfo $uc2]
    set res [data::GetCompSecond $uc1 $info2]
    if {$res != -1} {
	return $res
    }

    data::GetCompBoth $info1 $info2
}

########################################################################

proc ::unicode::composeCanonical {uclist} {
    if {[llength $uclist] == 0} {
	return {}
    }

    set res {}
    set comps {}
    set ch1 [lindex $uclist 0]
    set cclass_prev [data::GetUniCharCClass $ch1]
    foreach ch2 [lrange $uclist 1 end] {
	set cclass [data::GetUniCharCClass $ch2]
	if {($cclass_prev == 0 || $cclass > $cclass_prev) && \
		[set ruc [composeTwo $ch1 $ch2]]} {
	    set ch1 $ruc
	} else {
	    if {$cclass == 0} {
		lappend res $ch1
		set res [concat $res $comps]
		set comps {}
		set ch1 $ch2
		set cclass_prev 0
	    } else {
		lappend comps $ch2
		set cclass_prev $cclass
	    }
	}
    }
    lappend res $ch1
    concat $res $comps
}

########################################################################

package provide unicode 1.0.0