ossp-pkg/sugar/srml2sxml
#!/usr/bin/perl
##
## srml2sxml -- Sugar SRML to SXML Translation
## Copyright (c) 1999-2002 Ralf S. Engelschall <rse@engelschall.com>
##
require 5.005;
$|++;
my $tags1d = {
'==' => [ '<span class=h1>', '</span>' ],
'--' => [ '<span class=h2>', '</span>' ],
'~~' => [ '<span class=h3>', '</span>' ],
'..' => [ '<span class=h4>', '</span>' ],
'**' => [ '<b>', '</b>' ],
'//' => [ '<i>', '</i>' ],
'__' => [ '<ul>', '</ul>' ],
'[[' => [ '<boxed>', '</boxed>' ],
'||' => [ '<tt>', '</tt>' ],
'!!' => [ '<verbatim>', '</verbatim>' ],
'(+' => [ '<anchor>', '</anchor>' ],
'->' => [ '<xref>', '</xref>' ],
};
my $alias1d = {
'+)' => '(+',
'<-' => '->',
']]' => '[[',
};
my $tags2d = {
'==' => [ '<h1>', '</h1>' ],
'--' => [ '<h2>', '</h2>' ],
'~~' => [ '<h3>', '</h3>' ],
'..' => [ '<h4>', '</h4>' ],
'**' => [ '<b>', '</b>' ],
'//' => [ '<i>', '</i>' ],
'__' => [ '<ul>', '</ul>' ],
'[[' => [ '<boxed>', '</boxed>' ],
'||' => [ '<pre>', '</pre>' ],
'!!' => [ '<verbatim>', '</verbatim>' ],
'>>' => [ '<blockquote>', '</blockquote>' ],
'((' => [ '<align to=left>', '</align>' ],
'%%' => [ '<align to=center>', '</align>' ],
'))' => [ '<align to=right>', '</align>' ],
'++' => [ '<table>', '</table>' ],
'##' => [ '<comment>', '</comment>' ],
'$$' => [ '<list>', '</list>' ],
'o ' => [ '<item type=ul>', '</item>' ],
'- ' => [ '<item type=ul>', '</item>' ],
'o.' => [ '<item type=ol>', '</item>' ],
'-.' => [ '<item type=ol>', '</item>' ],
'::' => [ '<item type=dl>', '</item>' ],
'Title:' => [ '<title>', '</title>' ],
'Author:' => [ '<author>', '</author>' ],
'Genesis:' => [ '<genesis>', '</genesis>' ],
'Date:' => [ '<date>', '</date>' ],
'Abstract:' => [ '<abstract>', '</abstract>' ],
};
my $alias2d = {
'[]' => '[[',
};
my $escapes = {
'&' => '&',
'<' => '<',
'>' => '>',
};
# assemble mega-regex which matches all 1d-tags
my $tag1d = &assemble_regex((keys(%{$tags1d}),keys(%{$alias1d})));
my $tag2d = &assemble_regex((keys(%{$tags2d}),keys(%{$alias2d})));
sub assemble_regex {
my (@tags) = @_;
my $regex = '(?:';
foreach $tag (sort(@tags)) {
$regex .= quotemeta($tag)."+|";
}
$regex =~ s/\+\|$//;
$regex .= ')';
return $regex;
}
my $escape = '(?:';
foreach $e (sort(%{$escapes})) {
$escape .= quotemeta($e)."|";
}
$escape =~ s/\|$//;
$escape .= ')';
sub expandtab {
my ($ts, $str) = @_;
my $i = 0;
my $k;
$str =~ s|(.)
|($1 eq "\t"
? ($k = $i % $ts,
$i += ($ts - $k),
"*" x ($ts - $k) )
: ($i++, $1)
)
|sgex;
return $str;
}
# <document> ::= <blockgroup>*
# <blockgroup> ::= <block>*
# <block> ::= <para> | <2d-block>
# <2d-block> ::= <2d-tag> <document>
# <2d-tag> ::= m/^\s*XY+\s+/
# <para> ::= <1d-block>*
#
# <blockgroup> und <para> koennen degenerieren
# auf genau einen, d.h. z.B.
# <blockgroup> == 1 <para> oder <para> == 1 <1d-block>
#
# 1d-block:
# foo bar
# **baz
# quux
#
# 2d-block
# ** baz
# quux
#
# -> blank lines zwischen 2d-blocks koennen entfallen!
# -> blank lines wenn da sind markieren paragraphen
# und schliessen 1d block ab
sub parse_srml {
my ($srml) = @_;
my $sxml = '';
while ($srml ne '') {
# stop at end of input
if ($srml =~ m|^(\s*)$|s) {
$sxml .= $1;
$srml = '';
last;
}
# pass through blank lines
elsif ($srml =~ m|^((?: *\n)+)(.*)$|s) {
$sxml .= $1;
$srml = $2;
}
# recognize 2-dimensional block start
elsif ($srml =~ m|^( *)($tag2d)( +[^\n]*)?(.*)$|s) {
my ($prefix, $tag, $block) = ($1, $2, $3);
$block = $prefix . $block;
$srml = $4;
# reduce tag to canonical 2-character form
$tag =~ s|^(.)(.)\2+$|$1$2|s;
if (defined($alias2d->{$tag})) {
$tag = $alias2d->{$tag};
}
# gather block lines
# (same or less indent indicates end)
my $minindent = length($prefix)+1;
while ($srml =~ m/^((?: *| {$minindent,}[^\n]+)\n)(.*)$/s) {
$block .= $1;
$srml = $2;
}
$block = &parse_srml($block);
$block =~ s|^(\s*)(.*?)(\s*)$|
$1 . $tags2d->{$tag}->[0] .
$2 . $tags2d->{$tag}->[1] . $3|se;
$sxml .= $block;
}
# anything else we treat as a 1-dimensional block start
else {
# gather block lines
# (a blank line indicates end)
my $block = '';
while ($srml =~ m/^( *[^ \n][^\n]*\n)(.*)$/s) {
$block .= $1;
$srml = $2;
}
print STDERR "1D<<$block>>\n";
# escape special XML characters
$block =~ s|($escape)|$escapes->{$1}|sge;
# sequencially walk through 1-dimensional block
my $sxml_local = '';
my @active = ();
while ($block =~ m|^(.*?)($tag1d)(.*)$|s) {
my ($prolog, $tag, $epilog) = ($1, $2, $3);
$sxml_local .= $prolog;
$block = $epilog;
# reduce tag to canonical 2-character form
$tag =~ s|^(.)(.)\2+$|$1$2|s;
if (defined($alias1d->{$tag})) {
$tag = $alias1d->{$tag};
}
# look on stack of still active tags
my $active = '';
if ($#active > -1) {
$active = pop(@active);
}
if ($active eq $tag) {
# close the currently active tag
$sxml_local .= $tags1d->{$tag}->[1];
}
else {
# open a new active tag
if ($block =~ m|^\s*$|s) {
# tag wraps around
$sxml_local =~ s|^(\s*)(.*?)(\s*)$|
$1 . $tags1d->{$tag}->[0] .
$2 . $tags1d->{$tag}->[1] . $3|se;
}
else {
$sxml_local .= $tags1d->{$tag}->[0];
push(@active, $active) if ($active ne '');
push(@active, $tag);
}
}
}
# post-processing for implicitly to-be-closed tags
my $trailer = '';
if ($block =~ m|^(.*)(\s+)$|s) {
($block, $trailer) = ($1, $2);
}
$sxml_local .= $block;
while ($#active > -1) {
my $active = pop(@active);
$sxml_local .= $tags1d->{$active}->[1];
}
$sxml_local .= $trailer;
$sxml .= $sxml_local;
}
}
return $sxml;
}
my $srml = '';
$srml .= $_ while (<STDIN>);
$srml =~ s|\n$||s;
$srml .= "\n";
$srml = &expandtab(8, $srml);
my $sxml = '';
$sxml .= "<sugar>";
$sxml .= &parse_srml($srml);
$sxml .= "</sugar>\n";
print $sxml;