Print

Print


Hi,
 
I posted earlier a small perl script to translate from TEI Lite to
teixlite.
 
Meanwhile, I discovered some bugs and added some enhancements.
 
Now, the script works for multi-line tags.  It eats the whole TEI lite
document in memory and sends an xml document where GI have the correct
case.  It also closes empty tags "a la xml".
 
Improvements are for attributes, as suggested by Fotis Jannidis.
Attributes are quoted and attribute names are changed to the right case.
 
Please feel free to modify and fix it (just let me know).
 
There is also a second small script at the end of the message, used to
produce the tables at the beginning of the program (meta-perl?) With
this script, you can generate larger tables for any variation of full
TEI in XML.
 
  Jean-Daniel Fekete
  Ecole des Mines de Nantes, 4 rue Alfred Kastler, La Chantrerie,
  BP 20722, 44307 Nantes Cedex 03, France
  Voice: +33-2-51-85-82-08  | Fax: +33-2-51-85-82-49
  [log in to unmask] | http://www.emn.fr/fekete/
 
 
#!/usr/local/bin/perl
# File: tei2xtei.pl
# Author: Jean-Daniel Fekete
# Email: [log in to unmask]
#
# Translate a TEI file into a TEI XML file.
#
# Use it like this:
# tei2xtei.pl file.tei > file.xml
 
# The output is "almost" ready for xml. !DOCTYPE and !ENTITY lines
# are not modified though.
# It could be used for non stand-alone files included in other
# TEI files as external entities (header, link groups, etc)
#
 
# Table of the element names (GIs) known to the converter, each key
# spelled in the case that is written to the output.  The value is 1
# for the elements that treat_tags closes "a la xml" (a "/" is added
# before the ">") and 0 for all the others.
# The layout is the one printed by the mktei2xtei.pl script.
%xtei_elements = (
        "TEI.2" => 0,
        "gi" => 0,
        "eg" => 0,
        "code" => 0,
        "ident" => 0,
        "kw" => 0,
        "teiHeader" => 0,
        "fileDesc" => 0,
        "titleStmt" => 0,
        "sponsor" => 0,
        "funder" => 0,
        "principal" => 0,
        "editionStmt" => 0,
        "edition" => 0,
        "extent" => 0,
        "publicationStmt" => 0,
        "distributor" => 0,
        "authority" => 0,
        "idno" => 0,
        "availability" => 0,
        "seriesStmt" => 0,
        "notesStmt" => 0,
        "sourceDesc" => 0,
        "encodingDesc" => 0,
        "projectDesc" => 0,
        "samplingDecl" => 0,
        "editorialDecl" => 0,
        "tagsDecl" => 0,
        "tagUsage" => 0,
        "rendition" => 0,
        "refsDecl" => 0,
        "classDecl" => 0,
        "taxonomy" => 0,
        "category" => 0,
        "catDesc" => 0,
        "profileDesc" => 0,
        "creation" => 0,
        "langUsage" => 0,
        "language" => 0,
        "textClass" => 0,
        "keywords" => 0,
        "classCode" => 0,
        "catRef" => 1,
        "revisionDesc" => 0,
        "change" => 0,
        "p" => 0,
        "foreign" => 0,
        "emph" => 0,
        "hi" => 0,
        "q" => 0,
        "cit" => 0,
        "soCalled" => 0,
        "term" => 0,
        "mentioned" => 0,
        "gloss" => 0,
        "name" => 0,
        "rs" => 0,
        "num" => 0,
        "date" => 0,
        "time" => 0,
        "abbr" => 0,
        "sic" => 0,
        "corr" => 0,
        "reg" => 0,
        "orig" => 0,
        "gap" => 1,
        "add" => 0,
        "del" => 0,
        "unclear" => 0,
        "address" => 0,
        "addrLine" => 0,
        "ptr" => 1,
        "ref" => 0,
        "list" => 0,
        "item" => 0,
        "label" => 0,
        "head" => 0,
        "note" => 0,
        "index" => 1,
        "divGen" => 1,
        "milestone" => 1,
        "pb" => 1,
        "lb" => 1,
        "bibl" => 0,
        "biblFull" => 0,
        "listBibl" => 0,
        "author" => 0,
        "editor" => 0,
        "respStmt" => 0,
        "resp" => 0,
        "title" => 0,
        "imprint" => 0,
        "publisher" => 0,
        "biblScope" => 0,
        "pubPlace" => 0,
        "l" => 0,
        "lg" => 0,
        "sp" => 0,
        "speaker" => 0,
        "stage" => 0,
        "text" => 0,
        "body" => 0,
        "group" => 0,
        "div" => 0,
        "div0" => 0,
        "div1" => 0,
        "div2" => 0,
        "div3" => 0,
        "div4" => 0,
        "div5" => 0,
        "div6" => 0,
        "div7" => 0,
        "trailer" => 0,
        "byline" => 0,
        "dateline" => 0,
        "argument" => 0,
        "epigraph" => 0,
        "opener" => 0,
        "closer" => 0,
        "salute" => 0,
        "signed" => 0,
        "front" => 0,
        "titlePage" => 0,
        "docTitle" => 0,
        "titlePart" => 0,
        "docAuthor" => 0,
        "docEdition" => 0,
        "docImprint" => 0,
        "docDate" => 0,
        "back" => 0,
        "xref" => 0,
        "xptr" => 1,
        "seg" => 0,
        "anchor" => 1,
        "interp" => 1,
        "interpGrp" => 0,
        "s" => 0,
        "table" => 0,
        "row" => 0,
        "cell" => 0,
        "formula" => 0,
        "figure" => 0,
        "figDesc" => 0,
);
 
# Attribute names whose output spelling is not all lower case, keyed by
# the lower-cased name.  treat_attribute falls back to the lower-cased
# name for everything that is not listed here.
%attributes = (
        targtype  => 'targType',
        targetend => 'targetEnd',
        teiform   => 'TEIform',
);

# Case-folded GI => GI as spelled in %xtei_elements; starts out empty
# and is filled in by the start-up loop further down.
%translation = ();
 
 
# Tell whether an element has to be written as an empty XML tag.
#
# Argument: the generic identifier (GI), spelled as in %xtei_elements.
# Returns the flag stored in %xtei_elements (1 for empty elements, 0
# for the others), or undef when the GI is not in the table.
sub empty {
  # $_[0] is the scalar element; the former @_[0] was a one-element
  # array slice, which perl flags under warnings.
  return $xtei_elements{$_[0]};
}
 
# Build the text of one attribute specification with a quoted value.
#
# Arguments: the attribute name and its raw, unquoted value.
# Returns name="value" when the value has no double quote, name='value'
# when it has a double quote but no single quote, and otherwise
# name="value" with every double quote replaced by &quot;.
# The arguments are copied first: they are usually the regex captures
# $1/$2 of the caller, which are read-only, so editing $_[1] in place
# would die with "Modification of a read-only value attempted".
sub quote_attribute {
  my ($name, $value) = @_;

  return "$name=\"$value\"" if $value !~ /"/;
  return "$name='$value'"   if $value !~ /'/;

  # Both kinds of quote occur in the value.  XML has no backslash
  # escape, so the double quotes are replaced by the predefined entity.
  $value =~ s/"/&quot;/g;
  return "$name=\"$value\"";
}
 
# Give the output spelling of an attribute name.
#
# Argument: the attribute name in any case.
# Returns the entry of %attributes for the lower-cased name when there
# is one, and the lower-cased name itself otherwise.
sub treat_attribute {
  my ($name) = @_;
  my $key = lc($name);

  if (exists $attributes{$key}) {
    return $attributes{$key};
  }
  return $key;
}
 
# Normalise the attribute part of a start tag.
#
# Argument: the text found between the GI and the closing ">".  As
# before, the argument is rewritten in place and also returned.
#
# Every "name=value" pair is handled in a single pass:
#   - the name goes through treat_attribute (case correction);
#   - a value that is already quoted is kept untouched, so that text
#     such as "a=b" inside a quoted value is no longer rewritten;
#   - an unquoted value is quoted with quote_attribute, including
#     values of a single character (n=5), which the former pattern
#     [^'"]\S+ could not match.
# White space around "=" is accepted; it is kept for quoted values and
# dropped when an unquoted value gets quoted.
sub process_attributes {
  $_[0] =~ s{
      (\w+) (\s*=\s*)
      (?: ( "[^"]*" | '[^']*' )     # value that is already quoted
        | ( [^\s"'>]+ )             # unquoted token
      )
  }{
      # Copy the captures before calling any other sub.
      my ($name, $equals, $quoted, $bare) = ($1, $2, $3, $4);
      defined $quoted
        ? treat_attribute($name) . $equals . $quoted
        : quote_attribute(treat_attribute($name), $bare);
  }gex;
  return $_[0];
}
 
# Rewrite one tag for XML.
#
# Arguments:
#   $end  - "/" for an end tag, "" for a start tag;
#   $gi   - the generic identifier as found in the input;
#   $rest - what follows the GI up to (not including) the ">".
# Returns the complete tag: the GI gets the case used in
# %xtei_elements, and for a start tag the attributes go through
# process_attributes and a "/" is added when the element is empty.
# A GI that is not known in any case is left as it is.
sub treat_tags {
  # List assignment: the former "my $end = shift, $gi = shift, ..."
  # declared only $end and left $gi and $rest as package globals.
  my ($end, $gi, $rest) = @_;
  my $ret = "<$end";

  if (! exists($xtei_elements{$gi})) {
    my $lc_gi = lc($gi);
    if (exists $translation{$lc_gi}) {
      $gi = $translation{$lc_gi};
    } elsif (exists $xtei_elements{$lc_gi}) {
      # Elements whose name is all lower case ("p", "div1", ...) may be
      # missing from %translation under their lower-case key; without
      # this branch <P> would stay <P>.
      $gi = $lc_gi;
    }
#   else: unknown GI '$gi', not treated
  }
  $ret .= $gi;
  if ($end eq "") {
    $ret .= process_attributes($rest);
    # Do not add a second "/" to a tag that is already self-closed.
    $ret .= "/" if empty($gi) && $ret !~ m{/\s*\z};
  }
  $ret .= ">";
  return $ret;
}
 
# Fill %translation with the case-folded spellings of every known GI.
# treat_tags looks a GI up by its lower-cased form, so the lower-case
# key is always stored, also for names that are already all lower case
# (otherwise <P> is never mapped back to <p>).  The upper-case key is
# kept as before for names that are not all upper case.
foreach my $elem (keys %xtei_elements) {
  my $upper = uc($elem);

  $translation{lc($elem)} = $elem;
  if (! ($upper eq $elem)) {
    $translation{$upper} = $elem;
  }
}
 
# Read the whole input in one go (no record separator), so that tags
# spread over several lines are seen as a whole.
undef $/;
my $document = <>;

# Hand every start and end tag to treat_tags and write the result.
$document =~ s/<(\/?)([a-zA-Z0-9\._]+)(\s*[^>]*)>/treat_tags($1,$2,$3)/mges;
print $document;
 
 
#!/usr/local/bin/perl -n
#
# File: mktei2xtei.pl
# Author: Jean-Daniel Fekete
# Email: [log in to unmask]
#
# Parse the teixlite.dtd and translate tei files into xtei files accordingly.
#
# This program should be used like this:
# ./mktei2xtei.pl teixlite.dtd
# It produces PERL declarations like this:
#
# %xtei_elements = (
#       "TEI.2" => 0,
#       "gi" => 0,
# ...
# );
#
# %attributes = (
#       "targtype" => "targType",
#       "targetend" => "targetEnd",
#       "teiform" => "TEIform",
# );
#
# These declarations should be inserted in the tei2xtei.pl script (or
# replace the current ones).
 
 
# Runs once, before the first input line is read: start with an empty
# attribute table and open the %xtei_elements declaration on output.
BEGIN {
    %attributes = ();
    print "%xtei_elements = (\n";
}
 
# For a line that starts an element declaration, print one entry of
# %xtei_elements: the element name with 1 when the line contains
# " EMPTY>", and 0 otherwise.
if (my ($name) = /^<!ELEMENT\s+([^ ]+)/) {
  # The name is saved first: the next match would reset $1.
  my $flag = / EMPTY>/ ? 1 : 0;
  print qq{\t"$name" => $flag,\n};
}
 
# For an indented line that looks like "name TYPE default" (one line of
# an attribute list), remember the name when it is not all lower case:
# %attributes maps the lower-cased name to the spelling of the DTD.
# The declared type and the default value are matched but not used.
if (/^\s+(\w+)\s+(?:IDREF|IDREFS|ID|CDATA|[A-Z]+)\s+[^ ]+/) {
  # Separate declarations: the former "my $a = $1, $type = $2, ..."
  # declared only $a (which is also sort's special variable) and left
  # the other names as package globals.
  my $name = $1;
  my $lower = lc($name);
#  print "#adding $name\n";
  unless ($name eq $lower) {
    $attributes{$lower} = $name;
  }
}
 
# Runs once, after the last input line: close the %xtei_elements
# declaration and print the %attributes declaration collected from the
# attribute lists.
END {
    print ");\n";

    print "%attributes = (\n";
    # Sorted, so that the same DTD always gives the same output.
    foreach my $name (sort keys %attributes) {
      print "\t\"$name\" => \"$attributes{$name}\",\n";
    }
    # The list opened with "(" has to be closed with ")": the former
    # "};" made the generated declaration a syntax error.
    print ");\n";
}
}