# -*- tcl -*- # Copyright (c) 2009 Andreas Kupries # Recursive descent parser for Tcl commands embedded in a string. (=> # subst -novariables, without actual evaluation of the embedded # commands). Useful for processing templates, etc. The result is an # abstract syntax tree of strings and commands, which in turn have # strings and commands as arguments. # The tree can be processed further. The nodes of the tree are # annotated with line/column/offset information to allow later stages # the reporting of higher-level syntax and semantic errors with exact # locations in the input. # TODO :: Add ability to report progress through the # TODO :: input. Callback. Invoked in 'Initialize', 'Step', and # TODO :: 'Finalize'. # TODO :: Investigate possibility of using tclparser package # TODO :: ('parser') to handle the command pieces embedded in the # TODO :: text. # # ## ### ##### ######## ############# ##################### ## Requirements package require Tcl 8.4 ; # Required runtime. package require snit ; # OO system. package require fileutil ; # File utilities. package require logger ; # User feedback. package require struct::list ; # Higher-order list operations. package require struct::stack ; # Stacks package require struct::set ; # Finite sets package require treeql ; # Tree queries and transformation. # # ## ### ##### ######## ############# ##################### ## logger::initNamespace ::doctools::tcl::parse snit::type ::doctools::tcl::parse { # # ## ### ##### ######## ############# ## Public API typemethod file {t path {root {}}} { $type text $t [fileutil::cat -translation binary -encoding binary $path] $root } typemethod text {t text {root {}}} { # --- --- --- --------- --------- --------- # Phase 1. Lexical processing. # The resulting tree contains the raw tokens. See # below for the specification of the resulting tree # structure. # # This part is a recursive descent parser using Tcl's 12 rules # for processing the input. Note: Variable references are not # recognized, they are processed like regular text. Initialize $t $text $root String Finalize # Tree structure # - All nodes but the root have the attributes 'type', 'range', 'line', and 'col'. # # * 'type' in { Command, Text, Backslash, Word, Quote, Continuation, QBrace } # * 'range' is 2-element list (offset start, offset end) # * 'line' is integer number > 0 # * 'col' is integer number >= 0 # # 'type' specifies what sort of token the node contains. # # 'range' is the location of the token as offsets in # characters from the beginning of the string, for # first and last character in the token. EOL markers # count as one character. This can be empty. # # 'line', 'col' are the location of the first character # AFTER the token, as the line and column the character is # on and at. # # Meaning of the various node types # # Command .... : A command begins here, the text in the range # .............. is the opening bracket. # Text ....... : A text segment in a word, anything up to the # .............. beginning of a backslash sequence or of an # .............. embedded command. # Backslash .. : A backslash sequence. The text under the # .............. range is the whole sequence. # Word ....... : The beginning of an unquoted, quoted or # .............. braced word. The text under the range is the # .............. opening quote or brace, if any. The range is # .............. empty for an unquoted word. # Quote ...... : An embedded double-quote character which is # .............. not the end of a quoted string (a special # .............. type of backslash sequence). The range is the # .............. whole sequence. # Continuation : A continuation line in an unquoted, quoted, # .............. or braced string. The range covers the whole # .............. sequence, including the whitespace trailing # .............. it. # QBrace ..... : A quoted brace in a braced string. A special # .............. kind of backslash sequence. The range covers # .............. the whole sequence. # --- --- --- --------- --------- --------- # Phase 2. Convert the token tree into a syntax tree. # This phase simplifies the tree by converting and # eliminating special tokens, and further decouples # it from the input by storing the relevant string # ranges of the input in the tree. For the the # specification of the resulting structure see method # 'Verify'. # # The sub-phases are and do # # (a) Extract the string information from the input and store # them in their Text tokens. # (b) Convert the special tokens (QBrace, Backslash, Quote, # Continuation) into equivalent 'Text' tokens, with proper # string information. # (c) Merge adjacent 'Text' tokens. # (d) Remove irrelevant 'Word' tokens. These are tokens with a # single Text token as child. Word tokens without children # however represent empty strings. They are converted into # an equivalent Text node instead. # (e) Pull the first word of commands into the command token, # and ensure that it is not dynamic, i.e not an embedded # command. ShowTree $t "Raw tree" set q [treeql %AUTO% -tree $t] # (a) foreach n [$q query tree withatt type Text] { struct::list assign [$t get $n range] a e #$t unset $n range $t set $n text [string range $mydata $a $e] } ShowTree $t "Text annotation" # (b1) foreach n [$q query tree withatt type QBrace] { struct::list assign [$t get $n range] a e incr a ; # Skip backslash #$t unset $n range $t set $n text [string range $mydata $a $e] $t set $n type Text } ShowTree $t "Special conversion 1, quoted braces" # (b2) foreach n [$q query tree withatt type Backslash] { struct::list assign [$t get $n range] a e #$t unset $n range $t set $n text [subst -nocommands -novariables [string range $mydata $a $e]] #puts <'[string range $mydata $a $e]'> #puts _'[subst -nocommands -novariables [string range $mydata $a $e]]'_ $t set $n type Text } ShowTree $t "Special conversion 2, backslash sequences" # (b3) foreach n [$q query tree withatt type Quote] { #$t unset $n range $t set $n text "\"" $t set $n type Text } ShowTree $t "Special conversion 3, quoted double quotes" # (b4) foreach n [$q query tree withatt type Continuation] { #$t unset $n range $t set $n text { } $t set $n type Text } ShowTree $t "Special conversion 4, continuation lines" # (c) foreach n [$q query tree withatt type Text right withatt type Text] { set left [$t previous $n] $t append $left text [$t get $n text] # Extend covered range. Copy location. struct::list assign [$t get $left range] a _ struct::list assign [$t get $n range] _ e $t set $left range [list $a $e] $t set $left line [$t get $n line] $t set $left col [$t get $n col] $t delete $n } ShowTree $t "Merged adjacent texts" # (d) foreach n [$q query tree withatt type Word] { if {![$t numchildren $n]} { $t set $n type Text $t set $n text {} } elseif {[$t numchildren $n] == 1} { $t cut $n } } ShowTree $t "Dropped simple words" # (e) foreach n [$q query tree withatt type Command] { set first [lindex [$t children $n] 0] if {[$t get $first type] eq "Word"} { error {Dynamic command name} } $t set $n text [$t get $first text] $t set $n range [$t get $first range] $t set $n line [$t get $first line] $t set $n col [$t get $first col] $t delete $first } ShowTree $t "Command lifting" $q destroy Verify $t return } proc Verify {t} { # Tree structure ... # Attributes Values # - type string in {'Command','Text','Word'} (phase 2) # - range 2-tuple (integer, integer), can be empty. start and end offset of the word in the input string. # - line integer, line the node starts on. First line is 1 # - col integer, column the node starts on (#char since start of line, first char is 0) # Constraints # .(i) The root node has no attributes at all. # .(ii) The children of the root are Command and Text nodes in semi-alternation. # I.e.: After a Text node a Command has to follow. # After a Command node either Text or Command can follow. # .(iii) The children of a Command node are Text, Word, and Command nodes, the command arguments. If any. # .(iv) The children of a Word node are Command and Text nodes in semi-alternation. # .(v) All Text nodes are leafs. # .(vi) Any Command node can be a leaf. # .(vii) Word nodes cannot be leafs. # .(viii) All non-root nodes have the attributes 'type', 'range', 'col', and 'line'. foreach n [$t nodes] { if {[$t parent $n] eq ""} { # (ii) set last {} foreach c [$t children $n] { set type [$t get $c type] if {![struct::set contains {Command Text} $type]} { return -code error "$c :: Bad node type $type in child of root node" } elseif {($type eq $last) && ($last eq "Text")} { return -code error "$c :: Bad node $type, not semi-alternating" } set last $type } # (i) if {[llength [$t getall $n]]} { return -code error "$n :: Bad root node, has attributes, should not" } continue } else { # (viii) foreach k {range line col} { if {![$t keyexists $n $k]} { return -code error "$n :: Bad node, attribute '$k' missing" } } } set type [$t get $n type] switch -exact -- $type { Command { # (vi) # No need to check children. May have some or not, # and no specific sequence is required. } Word { # (vii) if {![llength [$t children $n]]} { return -code error "$n :: Bad word node is leaf" } # (iv) set last {} foreach c [$t children $n] { set type [$t get $c type] if {![struct::set contains {Command Text} $type]} { return -code error "$n :: Bad node type $type in word node" } elseif {($type eq $last) && ($last eq "Text")} { return -code error "$c :: Bad node $type, not semi-alternating" } set last $type } } Text { # (v) if {[llength [$t children $n]]} { return -code error "$n :: Bad text node is not leaf" } } default { # (iii) return -code error "$n :: Bad node type $type" } } } return } # # ## ### ##### ######## ############# ## Internal methods, lexical processing proc String {} { while 1 { Note @String if {[EOF]} break if {[Command]} continue if {[TextSegment]} continue if {[Backslash]} continue Stop ;# Unexpected character } Note @EOF return } proc Command {} { # A command starts with an opening bracket. Note ?Command if {![Match "\\A(\\\[)" range]} { Note \t%No-Command return 0 } Note !Command PushRoot [Node Command $range] while {[Word]} { # Step over any whitespace after the last word Whitespace # Command ends at the closing bracket if {[Match "\\A(\\])" range]} break if {![EOF]} continue Stop ;# Unexpected end of input } Note !CommandStop PopRoot return 1 } proc TextSegment {} { # A text segment is anything up to a command start or start of # a back slash sequence. Note ?TextSegment if {![Match "\\A(\[^\\\[\]+)" range]} { Note \t%No-TextSegment return 0 } Note !TextSegment Node Text $range return 1 } proc TextSegmentWithoutQuote {} { Note ?TextSegmentWithoutQuote # A text segment without quote is anything up to a command # start or start of a back slash sequence, or a double-quote # character. if {![Match "\\A(\[^\"\\\\\[\]+)" range]} { Note \t%No-TextSegmentWithoutQuote return 0 } Note !TextSegment Node Text $range return 1 } proc Backslash {} { Note ?Backslash if { ![Match "\\A(\\\\x\[a-fA-F0-9\]+)" range] && ![Match "\\A(\\\\u\[a-fA-F0-9\]{1,4})" range] && ![Match "\\A(\\\\\[0-2\]\[0-7\]{2})" range] && ![Match "\\A(\\\\\[0-7\]{1,2})" range] && ![Match {\A(\\[abfnrtv])} range] } { Note \t%No-Backslash return 0 } Note !Backslash Node Backslash $range return 1 } proc Word {} { Note ?Word if {[QuotedWord]} {return 1} if {[BracedWord 0]} {return 1} return [UnquotedWord] } proc Whitespace {} { Note ?Whitespace if {![Match {\A([ \t]|(\\\n[ \t]*))+} range]} { Note \t%No-Whitespace return 0 } Note !Whitespace return 1 } proc QuotedWord {} { # A quoted word starts with a double quote. Note ?QuotedWord if {![Match "\\A(\")" range]} { Note \t%No-QuotedWord return 0 } Note !QuotedWord PushRoot [Node Word $range] QuotedString PopRoot return 1 } proc BracedWord {keepclose} { # A braced word starts with an opening brace. Note ?BracedWord/$keepclose if {![Match "\\A(\{)" range]} { Note \t%No-BracedWord/$keepclose return 0 } Note !BracedWord/$keepclose PushRoot [Node Word $range] BracedString $keepclose PopRoot return 1 } proc UnquotedWord {} { Note !UnquotedWord PushRoot [Node Word {}] UnquotedString PopRoot return 1 } proc QuotedString {} { Note !QuotedString while 1 { Note !QuotedStringPart # A quoted word (and thus the embedded string) ends with # double quote. if {[Match "\\A(\")" range]} { return } # Now try to match possible pieces of the string. This is # a repetition of the code in 'String', except for the # different end condition above, and the possible embedded # double quotes and continuation lines the outer string # can ignore. if {[Command]} continue if {[Quote]} continue if {[QuotedBraces]} continue if {[Continuation]} continue if {[Backslash]} continue # Check after backslash recognition and processing if {[TextSegmentWithoutQuote]} continue Stop ;# Unexpected character or end of input } return } proc BracedString {keepclose} { while 1 { Note !BracedStringPart # Closing brace encountered. keepclose is set if we are in # a nested braced string. Only then do we have to put the # brace as a regular text piece into the string if {[Match "\\A(\})" range]} { if {$keepclose} { Node Text $range } return } # Special sequences. if {[QuotedBraces]} continue if {[Continuation]} continue if {[BracedWord 1]} continue # A backslash without a brace coming after is regular a # character. if {[Match {\A(\\)} range]} { Node Text $range continue } # Gooble sequence of regular characters. Stops at # backslash and braces. Backslash stop is needed to handle # the case of them starting a quoted brace. if {[Match {\A([^\\\{\}]*)} range]} { Node Text $range continue } Stop ;# Unexpected character or end of input. } } proc UnquotedString {} { while 1 { Note !UnquotedStringPart # Stop conditions # - end of string # - whitespace # - Closing bracket (end of command the word is in) if {[EOF]} return if {[Whitespace]} return if {[Peek "\\A(\\\])" range]} return # Match each possible type of part if {[Command]} continue if {[Quote]} continue if {[Continuation]} continue if {[Backslash]} continue # Last, capture backslash sequences first. if {[UnquotedTextSegment]} continue Stop ;# Unexpected character or end of input. } return } proc UnquotedTextSegment {} { # All chars but whitespace and brackets (start or end of # command). Note ?UnquotedTextSegment if {![Match {\A([^\]\[\t\n ]+)} range]} { Note \t%No-UnquotedTextSegment return 0 } Note !UnquotedTextSegment Node Text $range return 1 } proc Quote {} { Note ?EmdeddedQuote if {![Match "\\A(\\\")" range]} { Note \t%No-EmdeddedQuote return 0 } # Embedded double quote, not the end of the quoted string. Note !EmdeddedQuote Node Quote $range return 1 } proc Continuation {} { Note ?ContinuationLine if {![Match "\\A(\\\\\n\[ \t\]*)" range]} { Note \t%No-ContinuationLine return 0 } Note !ContinuationLine Node Continuation $range return 1 } proc QuotedBraces {} { Note ?QuotedBrace if { ![Match "\\A(\\\\\{)" range] && ![Match "\\A(\\\\\})" range] } { Note \t%No-QuotedBrace return 0 } Note !QuotedBrace Node QBrace $range return 1 } # # ## ### ##### ######## ############# ## Tree construction helper commands. proc Node {what range} { set n [lindex [$mytree insert $myroot end] 0] Note "+\tNode $n @ $myroot $what" $mytree set $n type $what $mytree set $n range $range $mytree set $n line $myline $mytree set $n col $mycol return $n } proc PushRoot {x} { Note "Push Root = $x" $myrootstack push $myroot set myroot $x return } proc PopRoot {} { set myroot [$myrootstack pop] Note "Pop Root = $myroot" return } # # ## ### ##### ######## ############# ## Error reporting proc Stop {} { ::variable myerr set ahead [string range $mydata $mypos [expr {$mypos + 30}]] set err [expr {![string length $ahead] ? "eof" : "char"}] set ahead [string map [list \n \\n \t \\t \r \\r] [string range $ahead 0 0]] set caller [lindex [info level -1] 0] set msg "[format $myerr($err) $ahead $caller] at line ${myline}.$mycol" set err [list doctools::tcl::parse $err $mypos $myline $mycol] return -code error -errorcode $err $msg } # # ## ### ##### ######## ############# ## Input processing. Match/peek lexemes, update location after ## stepping over a range. Match = Peek + Step. proc EOF {} { Note "?EOF($mypos >= $mysize) = [expr {$mypos >= $mysize}]" return [expr {$mypos >= $mysize}] } proc Match {pattern rv} { upvar 1 $rv range set ok [Peek $pattern range] if {$ok} {Step $range} return $ok } proc Peek {pattern rv} { upvar 1 $rv range Note Peek($pattern)----|[string map [list "\n" "\\n" "\t" "\\t"] [string range $mydata $mypos [expr {$mypos + 30}]]]| if {[regexp -start $mypos -indices -- $pattern $mydata -> range]} { Note \tOK return 1 } else { Note \tFAIL return 0 } } proc Step {range} { struct::list assign $range a e set mylastpos $mypos set mypos $e incr mypos set pieces [split [string range $mydata $a $e] \n] set delta [string length [lindex $pieces end]] set nlines [expr {[llength $pieces] - 1}] if {$nlines} { incr myline $nlines set mycol $delta } else { incr mycol $delta } return } # # ## ### ##### ######## ############# ## Setup / Shutdown of parser/lexer proc Initialize {t text root} { set mytree $t if {$root eq {}} { set myroot [$t rootname] } else { set myroot $root } if {$myrootstack ne {}} Finalize set myrootstack [struct::stack %AUTO%] $myrootstack clear set mydata $text set mysize [string length $mydata] set mypos 0 set myline 1 set mycol 0 return } proc Finalize {} { $myrootstack destroy set myrootstack {} return } # # ## ### ##### ######## ############# ## Debugging helper commands ## Add ability to disable these. ## For the tree maybe add ability to dump through a callback ? proc Note {text} { upvar 1 range range set m {} append m "$text " if {[info exists range]} { append m "($range) " if {$range != {}} { foreach {a e} $range break append m " = \"[string map [list "\n" "\\n" "\t" "\\t"] \ [string range $mydata $a $e]]\"" } } else { append m "@$mypos ($myline/$mycol)" } #log::debug $m puts $m return } #proc ShowTreeX {args} {} proc ShowTreeX {t x} { puts "=== \[ $x \] [string repeat = [expr {72 - [string length $x] - 9}]]" $t walk root -order pre -type dfs n { set prefix [string repeat .... [$t depth $n]] puts "$prefix$n <[DictSort [$t getall $n]]>" } return } proc Note {args} {} proc ShowTree {args} {} # # ## ### ##### ######## ############# proc DictSort {dict} { array set tmp $dict set res {} foreach k [lsort -dict [array names tmp]] { lappend res $k $tmp($k) } return $res } # # ## ### ##### ######## ############# ## Parser state typevariable mytree {} ; # Tree we are working on typevariable myroot {} ; # Current root to add nodes to. typevariable myrootstack {} typevariable mydata {} ; # String to parse. typevariable mysize 0 ; # Length of string to parse, cache typevariable mylastpos ; # Last current position. typevariable mypos 0 ; # Current parse location, offset from typevariable myline 1 ; # the beginning of the string, line typevariable mycol 0 ; # we are on, and the column within the # line. typevariable myerr -array { char {Unexpected character '%1$s' in %2$s} eof {Unexpected end of input in %2$s} } # # ## ### ##### ######## ############# ## Configuration pragma -hasinstances no ; # singleton pragma -hastypeinfo no ; # no introspection pragma -hastypedestroy no ; # immortal ## # # ## ### ##### ######## ############# } namespace eval ::doctools::tcl { namespace export parse } # # ## ### ##### ######## ############# ##################### ## Ready package provide doctools::tcl::parse 0.1 return