Print

Print


Hi Sebastian,

Here is the full solution for correcting invalid XHTML p tags. It also 
accumulates the longest possible run of text and tags that can be enclosed in 
a p element. For instance, the following invalid XHTML

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
   <head>
     <title>Example of converting invalid XHTML to valid XHTML</title>
   </head>
   <body>
     <p>
       Here we have a more <em>complex</em><b> sample </b> of 
<big>invalid</big>
       <acronym>XHTML</acronym>
       <pre>containing pre-formated text
       for instance</pre>
       and a <b>unordered</b> list
       <ul>
         <li>with item 1</li>
         <li>and item 2</li>
       </ul>
       and you see they are converted to <big>valid</big> 
<acronym>XHTML</acronym>
       maximizing the partes enclosed in <b>p</b> tags.
     </p>
   </body>
</html>

will be converted to the following valid XHTML

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
   <head>
     <title>Example of converting invalid XHTML to valid XHTML</title>
   </head>
   <body>
     <xhtml:p xmlns:xhtml="http://www.w3.org/1999/xhtml">
       Here we have a more <em>complex</em>
       <b> sample </b> of <big>invalid</big>
       <acronym>XHTML</acronym>
     </xhtml:p>
     <pre>containing pre-formated text
       for instance</pre>
     <xhtml:p xmlns:xhtml="http://www.w3.org/1999/xhtml">
       and a <b>unordered</b> list
     </xhtml:p>
     <ul>
       <li>with item 1</li>
       <li>and item 2</li>
     </ul>
     <xhtml:p xmlns:xhtml="http://www.w3.org/1999/xhtml">
       and you see they are converted to <big>valid</big>
       <acronym>XHTML</acronym>
       maximizing the partes enclosed in <b>p</b> tags.
     </xhtml:p>
   </body>
</html>

And here is the stylesheet:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Splits every XHTML p element that contains a child element not permitted
  inside p into a sequence of siblings: each maximal run of permitted
  content is wrapped in its own p, and each non-permitted child is emitted
  between those paragraphs. Everything else is copied through unchanged.

  Note: attributes on the original p are not carried over to the generated
  p elements; only its child nodes are processed.
-->
<!DOCTYPE xsl:stylesheet [
   <!-- XPath test that is true for an element which is NOT one of the
        listed XHTML elements, i.e. one that must be moved out of the p. -->
   <!ENTITY notInsideP "not(self::xhtml:br or self::xhtml:span or
   self::xhtml:em or self::xhtml:strong or
   self::xhtml:dfn or self::xhtml:code or
   self::xhtml:samp or self::xhtml:kbd or
   self::xhtml:var or self::xhtml:cite or
   self::xhtml:abbr or self::xhtml:acronym or
   self::xhtml:q or self::xhtml:tt or
   self::xhtml:i or self::xhtml:b or
   self::xhtml:big or self::xhtml:small or
   self::xhtml:sub or self::xhtml:sup or
   self::xhtml:bdo or self::xhtml:a or
   self::xhtml:img or self::xhtml:map or
   self::xhtml:object or self::xhtml:input or
   self::xhtml:select or self::xhtml:textarea or
   self::xhtml:label or self::xhtml:button or
   self::xhtml:ruby or  self::xhtml:ins or self::xhtml:del or
   self::xhtml:script or self::xhtml:noscript)">
]>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
   version="1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"
   xmlns:xhtml="http://www.w3.org/1999/xhtml" exclude-result-prefixes="tei">
   <xsl:output indent="yes"/>

   <!-- Identity transform: copy every node and attribute as-is unless a
        more specific template applies. -->
   <xsl:template match="node() | @*">
     <xsl:copy>
       <xsl:apply-templates select="node() | @*"/>
     </xsl:copy>
   </xsl:template>

   <!-- Matches only p elements that have at least one child element that
        is not allowed inside p. Iterates over those offending children and
        emits, around each one, the permitted content that surrounds it. -->
   <xsl:template match="xhtml:p[*[&notInsideP;]]">
     <xsl:for-each select="*[&notInsideP;]">
       <!-- Leading run: everything before the first offending child. -->
       <xsl:if test="position()=1 and preceding-sibling::node()">
         <xhtml:p>
           <xsl:apply-templates select="preceding-sibling::node()"/>
         </xhtml:p>
       </xsl:if>
       <!-- Middle runs: the nodes between the previous offending child
            and this one. -->
       <xsl:if test="position()>1">
         <xsl:variable name="precedingNonInP"
           select="preceding-sibling::*[&notInsideP;][1]"/>
         <!-- Nodes are matched by generate-id() rather than with = / !=,
              because XPath 1.0 compares node-sets by string value. With
              string comparison, empty elements such as br were dropped
              and runs following blocks with equal text were duplicated. -->
         <xsl:variable name="precedingId"
           select="generate-id($precedingNonInP)"/>
         <!-- A preceding sibling belongs to this run exactly when its
              nearest preceding offending sibling is $precedingNonInP. -->
         <xsl:variable name="nodes"
           select="preceding-sibling::node()[
             generate-id(preceding-sibling::*[&notInsideP;][1])=$precedingId
           ]"/>
         <xsl:if test="count($nodes)>0">
           <xhtml:p><xsl:apply-templates select="$nodes"/></xhtml:p>
         </xsl:if>
       </xsl:if>
       <!-- The offending child itself, emitted outside any p. -->
       <xsl:apply-templates select="."/>
       <!-- Trailing run: everything after the last offending child. -->
       <xsl:if test="position()=last() and following-sibling::node()">
         <xhtml:p>
           <xsl:apply-templates select="following-sibling::node()"/>
         </xhtml:p>
       </xsl:if>
     </xsl:for-each>
   </xsl:template>
</xsl:stylesheet>

Best Regards,
George
---------------------------------------------------------------------
George Cristian Bina
<oXygen/> XML Editor, Schema Editor and XSLT Editor/Debugger
http://www.oxygenxml.com


Sebastian Rahtz wrote:
> I think that's too simplistic, as it would turn <p><b>hello</b></p> into
> <b>hello</b> on its own. Interesting idea, though, of a post-processor
> to clean up; it would be elegant because it could simply be thrown away
> when switching to XHTML 2.0. And in an XSLT context it could be
> implemented natively.
> 
> There's lots to think about here. I hope I'm not the only one who's
> going back to his/her XSL and wondering how they can improve it...
> 
> (PS for the record, I have a hybrid policy of turning any <p> which
> contains a list, eg, quote, table etc etc) into a <div>, and leaving
> text-only ones alone. Horrible, eh. I think I'll stop doing it...)