blob: f8d7282064c58e3f47726e19878f247c2007fc6b [file] [log] [blame]
#!/usr/bin/perl
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements. See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership. The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License. You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
# Grammar into HTML
# Read in sparql.txt and the tokens.txt file
## ToDo:
## Check tokens exist and are used
## Validate
if ( $#ARGV != 1 )
{
print STDERR "Usage: grammar.txt tokens.txt\n" ;
exit 1 ;
}
$/ = undef ;
# Just table or full page.
$TABLE = 1 ;
$grammarFile = $ARGV[0] ;
$tokensFile = $ARGV[1] ;
$grammar = &readFile($grammarFile) ;
$tokens = &readFile($tokensFile) ;
$grammar =~ s!DOCUMENT START!! ;
# $grammar =~ s!NON-TERMINALS!! ;
$grammar =~ s!DOCUMENT END!! ;
$grammar =~ s!TOKENS.*NON-TERMINALS!!s ;
$grammar =~ s!//.*!!g ;
$grammar =~ s!\r!!g ;
# remove leading whitespace
$grammar =~ s!^[\n\s]*!\n! ;
# Merge alts
$grammar =~ s!\n\s*\|!\ |!g ;
$tokens =~ s!//.*!!g ;
$tokens =~ s!\r!!g ;
## Grammar
#print "GRAMMAR\n" ;
@g = split(/\n\s*/, $grammar) ;
@rules = () ;
%ruleMap = () ;
%tokenMap = () ;
%inline = () ;
# Grammar rules
# Direct from "jjdoc -TEXT=true"
for $g (@g)
{
($rulename, $rulebody) = split(/:=/,$g) ;
$rulename =~ s!^\s*!! ;
$rulename =~ s!\s*$!! ;
$rulebody =~ s!^\s*!! ;
$rulebody =~ s!\s*$!! ;
# Remove outer brackets
# $rulebody =~ s!^\((.*)\)$!$1! ;
# Remove <> around tokens in grammar.
## Now done very late (as &lt;&gt;) in fixups.
## $rulebody =~ s/\<(\w+)\>/$1/g ;
# Leave in - so tokens distinguished from rules
next if $rulename eq '' ;
#next if $rulebody eq '' ;
# Skip the root rule.
next if ( $rulename eq 'CompilationUnit' ) ;
$rulebody = 'Perl 5 regular expression'
if ( $rulename eq 'PatternLiteral' ) ;
push @rules, $rulename ;
warn "Duplicate rule (grammar): $rulename\n" if defined($ruleMap{$rulename}) ;
$ruleMap{$rulename} = $rulebody ;
## print "----------\n" ;
## print $rulename,"\n" ;
## print $rulebody,"\n" ;
}
# Tokens
# Produced by "jj2tokens"
# Hand edited to indicate the inlines
$tokens =~ s/\n+/\n/g ;
$tokens =~ s/^\n// ;
@t = split(/\n(?=\<|\[)/, $tokens) ;
for $t (@t)
{
($tokenname,$tokenbody) = split(/::=/, $t) ;
$tokenname =~ s!^\s*!! ;
$tokenname =~ s!\s*$!! ;
## # remove <> around tokens
## Do very late as a formatting fix up.
## $tokenname =~ s/^\<// ;
## $tokenname =~ s/\>$// ;
$tokenname =~ s/#// ;
$tokenbody =~ s!^\s*!! ;
$tokenbody =~ s!\s*$!! ;
# <> round tokens
# Remove at last minute.
# Remove outer ()
# $tokenbody =~ s!^\((.*)\)$!$1! ;
# Inline?
if ( $tokenname =~ /^\[\<\w*\>\]/ )
{
warn "Duplicate inline (token): $tokenname\n" if defined($inline{$tokenname}) ;
$tokenname =~ s/^\[//g ;
$tokenname =~ s/\]$//g ;
$tokenbody =~ s/"/'/g ; # '" -- But not literal " -- how?
$tokenbody =~ s/\<\>\'\{\}/\<\>\"\{\}/ ; # '" IRI fixup
$inline{$tokenname} = $tokenbody ;
#print "INLINE: ",$tokenname," => ",$tokenbody,"\n" ;
}
else
{
push @rules, $tokenname ;
warn "Duplicate rule (token): $tokenname\n" if defined($tokenMap{$tokenname}) ;
$ruleMap{$tokenname} = $tokenbody ;
}
}
# Table
if ( ! $TABLE )
{
print "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" ;
print "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n" ;
print " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
print "\n" ;
print "<html>\n";
print "<head>\n";
print "<title>SPARQL Grammar</title>\n" ;
print "<style type=\"text/css\">\n" ;
# .token inline
# .ruleHead
# .ruleBody
print <<'EOF' ;
div.grammarTable table * { border-width: 0 ; }
div.grammarTable table * tr { border: 1px solid black ; }
.grammar { text-align: left ; vertical-align: top ; }
.token { color: #3f3f5f; }
.gRuleHead { font-style: italic ; font-family: monospace ; }
.gRuleBody { font-family: monospace ; }
.gRuleLabel { font-family: monospace ; }
EOF
print "</style>\n" ;
print "</head>\n";
print "<body>\n";
print "\n" ;
}
print "<div class=\"grammarTable\">\n" ;
print " <table><tbody>\n" ;
$ruleNum = 0 ;
for $r (@rules)
{
$DEBUG = 0 ;
$ruleNum++ ;
$rulename = $r ;
$rulebody = $ruleMap{$rulename} ;
## $DEBUG = 1 if ( $rulename =~ /Prolog/ ) ;
$rb = $rulebody ;
if ( $DEBUG )
{
print STDERR "\n" ;
print STDERR "Rule: $rulename\n" ;
print STDERR "Body: $rulebody\n" ;
}
## Do before '||' substitution
# Not perfect - some fixups later.
#$rb =~ s%\|%\<br/\>\|%g ;
# Escape HTML chars before adding markup.
$rb = esc($rb) ;
# Inlines
for $k (keys %inline)
{
$s = span('token', $inline{$k}) ;
$k = esc($k) ;
# Assumes escaped <> round tokens.
$k = quotemeta $k ;
$rb =~ s/$k/$s/g ;
}
if ( $DEBUG )
{
print STDERR "After inlining\n" ;
print STDERR $rb,"\n" ; ;
}
# Add hrefs - issue if one is a substring of another \W helps.
for $k (keys %ruleMap)
{
$s = href("r-".$k,$k) ;
$k = esc($k) ;
$k = quotemeta $k ;
## if ( $DEBUG )
## {
## print STDERR "K:$k\n" ;
## }
$rb =~ s/(?=\W)(\s*)$k(\s*)(?=\W)/$1$s$2/g ;
$rb =~ s/^$k(\s*)(?=\W)/$s$1/g ;
$rb =~ s/(?=\W)(\s*)$k$/$1$s/g ;
$rb =~ s/^$k$/$s/g ;
}
if ( $DEBUG )
{
print STDERR "After hrefs\n" ;
print STDERR $rb,"\n" ; ;
}
#exit if $ruleNum > 2 ;
$rn = anchor("r-".$rulename, $rulename) ;
$rn = fixupHead($rn) ;
if($rulename eq 'IRIREF')
{
print " </tbody></table>\n" ;
print "</div>\n" ;
print "<p>Productions for terminals:</p>\n" ;
print "<div class=\"grammarTable\">\n" ;
print " <table><tbody>\n" ;
}
print "\n" ;
print "<tr valign=\"baseline\">\n" ;
$rlabel = '[' . $ruleNum . ']&nbsp;&nbsp;' ;
print " <td>",code('gRuleLabel', $rlabel),"</td>\n" ;
#print " <td>",span('gRuleHead', $rn),"</td>\n" ;
print " <td>",code('gRuleHead',$rn),"</td>\n" ;
print " <td>&nbsp;&nbsp;::=&nbsp;&nbsp;</td>\n" ;
$rb = fixupRule($rulename, $rb) ;
print " <td>",code('gRuleBody',$rb),"</td>\n" ;
print "</tr>\n" ;
# $rule{$rulename, $rulebody) ;
# print $rulename , "\n" ;
}
print " </tbody></table>\n" ;
print "</div>\n" ;
if ( !$TABLE )
{
print "\n" ;
print "</body>\n" ;
print "</html>\n" ;
}
sub readFile
{
my $f = $_[0] ;
open(F, "$f") || die "$!";
my $s = <F> ;
return $s ;
}
sub esc
{
my $s = $_[0] ;
$s =~ s/&/&amp;/g ;
$s =~ s/</&lt;/g ;
$s =~ s/>/&gt;/g ;
return $s ;
}
sub span
{
my $c = $_[0] ;
my $t = $_[1] ;
$t = esc($t) ;
my $s = '<span class="' . $c . '">' . $t . '</span>' ;
return $s ;
}
sub href
{
my $a = $_[0] ;
my $t = $_[1] ;
$a = sane($a) ;
$t = esc($t) ;
my $s = '<a href="#' . $a . '">' . $t . '</a>' ;
return $s ;
}
sub anchor
{
my $a = $_[0] ;
my $t = $_[1] ;
$a = sane($a) ;
$t = esc($t) ;
my $s = '<a id="' . $a . '" name="' . $a . '">' . $t . '</a>' ;
return $s ;
}
sub sane
{
my $a = $_[0] ;
$a =~ s/\W//g ;
return $a ;
}
sub code
{
my $c = $_[0] ;
my $t = $_[1] ;
return '<code class="' . $c . '">' . $t . '</code>' ;
}
sub fixupHead
{
my $head = $_[0] ;
# Remove <> around tokens.
$head =~ s/&lt;(\w+)&gt;/$1/g ;
return $head ;
}
sub fixupRule
{
my $head = $_[0] ;
my $body = $_[1] ;
# Remove unnecessary ()
$body =~ s/\(\s*([^()| ]*) \)/$1/g ;
## if ( $body =~ m!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)! )
## {
## $b = $body ;
## print "================================\n" ;
## print STDERR "$b\n" ;
## print STDERR "--------\n" ;
## $b =~ s!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)!$1!g ;
## $b =~ s!\(\s+(\<span[^>]*\>[^<>]*\</span\>)\s+\)!$1!g ;
## print STDERR "$b\n" ;
## print STDERR "=====\n" ;
## print STDERR "\n" ;
## }
# Remove outer matching () where there are no inner ()
$body =~ s/^\(\s+([^\(]*)\s+\)$/$1/ ;
# ( A )* => A* and for + and ? where A is a linked or spanned object
$body =~ s!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)!$1!g ;
$body =~ s!\(\s+(\<span[^>]*\>[^<>]*\</span\>)\s+\)!$1!g ;
# There aren't any of these
## $body =~ s!\(\s+(\S*)\s+\)!$1!g ;
# Remove <> around tokens.
$body =~ s/&lt;(\w+)&gt;/$1/g ;
# Specials
# Split long bodies
if ( $head eq "CallExpression" ||
$head eq "UnaryExpression" ||
$head eq "<NCCHAR1p>" ||
$head eq "PatternElement" ||
$head eq "BuiltInCall" )
{
$body =~ s%\|%\<br/\>\|%g ;
$body =~ s/^\s+// ;
$body = "&nbsp;&nbsp;".$body ;
}
if ( $head eq "Aggregate" )
{
# Strip outer ()
$body =~ s/^\(\s*(.*)\s*\)$/$1/ ;
## Be careful of nested | in COUNT
$body =~ s%(\| \<span class="token")%\<br/\>$1%g ;
$body =~ s/^\s+// ;
$body = "&nbsp;&nbsp;".$body ;
}
if ( $head eq "BuiltInCall" )
{
# Undo <br/> for BNODE, RAND
# <br/>| <a href="#rNIL">NIL</a> )
$body =~ s%\<br/\>\| *\<a href="#rNIL"\>NIL\</a\>%\| \<a href="#rNIL"\>NIL\</a\>%g ;
}
if ( $head eq "RelationalExpression" ||
$head eq "AdditiveExpression" ||
$head eq "MultiplicativeExpression" ||
$head eq "ConditionalOrExpression")
{
$body =~ s%\*\(%<br/>\(% ;
}
# These failed the outer () test because they have nested () in them
if ( $head eq "QueryPattern" ||
$head eq "OrderCondition" )
{
# Remove outer ()
$body =~ s/^\((.*)\)$/$1/ ;
}
if ( $head eq "Query" )
{
$body =~ s! \(!<br/>\(! ;
$body =~ s!\) !\)<br/>! ;
}
if ( $head =~ m/(Select|Construct|Describe|Ask)Query/ )
{
# Put a line break before the DatasetClause
# <a href="#rDatasetClause">DatasetClause</a>
$c = '<a href="#rDatasetClause">DatasetClause</a>' ;
$c = quotemeta $c ;
# Expects the dataset clause to be unbracketted
$body =~ s!(\(\s*$c)!<br/>$1! ;
}
if ( $head eq "OrderCondition" )
{
$body =~ s!\)\s*\|\s*\(!\)<br/>\| \(! ;
$body = " ".$body ;
}
#Rules where an outer () is unnecessary.
if ( $head eq "GroupCondition" ||
$head eq "LimitOffsetClauses" ||
$head eq "GraphOrDefault" ||
$head eq "ArgList" ||
$head eq "ExpressionList" ||
$head eq "PathPrimary" ||
$head eq "PathMod" ||
$head eq "PathPrimary" ||
$head eq "PathNegatedPropertySet" ||
$head eq "PathOneInPropertySet")
{
$body =~ s/^\(\s*(.*)\s*\)$/$1/ ;
}
return $body ;
}