I am writing a parser for XML that will not have
an associated DTD. I want to be able to handle
certain character entity references (e.g., &copy;) in
the program.
When I run the following against a chunk of XML
containing &copy;, I get the following:
org.xml.sax.SAXParseException: Reference to undefined entity "&copy;".
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3182)
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3176)
at org.apache.crimson.parser.Parser2.expandEntityInContent(Parser2.java:2513)
at org.apache.crimson.parser.Parser2.maybeReferenceInContent(Parser2.java:2422)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1833)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.parseInternal(Parser2.java:500)
at org.apache.crimson.parser.Parser2.parse(Parser2.java:305)
at org.apache.crimson.parser.XMLReaderImpl.parse(XMLReaderImpl.java:442)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:345)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:281)
at Article.main(Article.java:18)
What can I do to catch these references in my code and output replacement
text for them?
Thanks.
Dean Hoover
Here are the two Java files:
---
import java.io.*;
import javax.xml.parse rs.*;
import org.xml.sax.*;
import org.xml.sax.hel pers.*;
public class Article
{
public static void main(String argv[])
{
String file = argv[0];
PrintWriter pw = new PrintWriter(Sys tem.out);
DefaultHandler handler = new LoadXML(pw, LoadXML.TYPE_HT ML);
SAXParserFactor y factory = SAXParserFactor y.newInstance() ;
try
{
SAXParser reader = factory.newSAXP arser();
reader.parse(ne w File(file), handler);
}
catch (Exception e)
{
e.printStackTra ce();
return;
}
pw.flush();
}
}
---
import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;

/**
 * SAX handler that renders a small article vocabulary as HTML on a Writer.
 *
 * In TYPE_HTML mode the elements "p", "title", "by" and "copyright" each
 * become a &lt;p&gt; with a matching CSS class, and "br" becomes
 * &lt;br/&gt;. Text directly inside one of the four paragraph-like elements
 * is whitespace-normalised, HTML-escaped and written; all other text is
 * dropped. In any other mode nothing is written.
 */
public class LoadXML extends DefaultHandler
{
    public static final int TYPE_HTML = 1;
    public static final int TYPE_TEXT = 2;

    /**
     * @param writer destination for the generated output
     * @param type   output mode, TYPE_HTML or TYPE_TEXT
     */
    public LoadXML
    (
        java.io.Writer writer,
        int type
    )
    {
        elements_ = new Stack();
        text_ = new StringBuffer();
        writer_ = writer;
        type_ = type;
    }

    /**
     * Supplies the literal text "stuff" as the content of every external
     * entity the parser asks about.
     *
     * This hook concerns external entities only. It is not consulted for a
     * reference to an entity that was never declared (such as &amp;copy; in a
     * document without a DTD); the parser reports that as a fatal error.
     *
     * @param publicId public identifier of the external entity, may be null
     * @param systemId system identifier of the external entity
     * @return an input source reading the replacement content
     */
    public InputSource resolveEntity
    (
        String publicId,
        String systemId
    ) throws SAXException
    {
        String s = "stuff";
        return new InputSource(new CharArrayReader(s.toCharArray()));
    }

    /** Resets the per-document state so one handler can serve several parses. */
    public void startDocument() throws SAXException
    {
        elements_.clear();
        text_.setLength(0);
    }

    /** Nothing to do; flushing the writer is left to the caller. */
    public void endDocument() throws SAXException
    {
    }

    /**
     * Emits any text pending for the enclosing element, records the new
     * element as the current one, and in HTML mode writes its opening tag.
     *
     * @throws SAXException wrapping any IOException from the writer
     */
    public void startElement
    (
        String uri,
        String localName,
        String qName,
        Attributes attributes
    ) throws SAXException
    {
        // Text seen so far belongs to the parent, so emit it before pushing.
        flushText();

        String elementName = qName;
        elements_.push(elementName);

        if (type_ == TYPE_HTML)
        {
            String cssClass = cssClassFor(elementName);
            if (cssClass != null)
                write("<p class=\"" + cssClass + "\">");
        }
    }

    /**
     * Emits any text pending for the element being closed, removes it from
     * the element stack, and in HTML mode writes its closing markup.
     *
     * @throws SAXException wrapping any IOException from the writer
     */
    public void endElement
    (
        String uri,
        String localName,
        String qName
    ) throws SAXException
    {
        // Must run before the pop: the pending text belongs to this element.
        flushText();

        String elementName = qName;
        elements_.pop();

        if (type_ == TYPE_HTML)
        {
            if (cssClassFor(elementName) != null)
                write("</p>\n");
            else if (elementName.equals("br"))
                write("<br/>\n");
        }
    }

    /**
     * Collects character data. A parser may deliver one run of text in
     * several calls (typically split around references such as &amp;amp;), so
     * the text is only buffered here; normalising each chunk on its own would
     * strip the spaces at the chunk boundaries and glue words together.
     */
    public void characters
    (
        char[] ch,
        int start,
        int length
    ) throws SAXException
    {
        text_.append(ch, start, length);
    }

    /**
     * Writes the buffered text run, if any, and empties the buffer. Newlines
     * become spaces, runs of spaces collapse to one, and leading and trailing
     * whitespace is trimmed. The text is written only in HTML mode and only
     * when the current element is one of the paragraph-like elements.
     */
    private void flushText() throws SAXException
    {
        if (text_.length() == 0)
            return;

        String text =
            text_.toString().replaceAll("\n", " ").replaceAll(" +", " ").trim();
        text_.setLength(0);

        if (text.length() == 0 || type_ != TYPE_HTML || elements_.isEmpty())
            return;

        String top = (String)elements_.peek();
        if (cssClassFor(top) != null)
            write(escape(text));
    }

    /**
     * Returns the CSS class used for a paragraph-like element, or null when
     * the element is not rendered as a paragraph.
     */
    private static String cssClassFor(String elementName)
    {
        if (elementName.equals("p"))
            return "article-text";
        if (elementName.equals("title"))
            return "article-title";
        if (elementName.equals("by"))
            return "article-by";
        if (elementName.equals("copyright"))
            return "article-copyright";
        return null;
    }

    /**
     * Escapes the characters that are markup in HTML text content. The parser
     * hands over already-decoded text, so a literal ampersand or angle bracket
     * has to be re-encoded before it is written into the HTML output.
     */
    private static String escape(String text)
    {
        StringBuffer out = new StringBuffer(text.length() + 16);
        for (int i = 0; i < text.length(); i++)
        {
            char c = text.charAt(i);
            switch (c)
            {
                case '&': out.append("&amp;"); break;
                case '<': out.append("&lt;"); break;
                case '>': out.append("&gt;"); break;
                default: out.append(c);
            }
        }
        return out.toString();
    }

    /** Writes to the output, converting an IOException into a SAXException. */
    private void write(String s) throws SAXException
    {
        try
        {
            writer_.write(s);
        }
        catch (IOException e)
        {
            throw new SAXException(e);
        }
    }

    private Stack elements_;
    private StringBuffer text_;
    private java.io.Writer writer_;
    private int type_;
}
I am writing a parser for XML that will not have
an associated DTD. I want to be able to handle
certain character entity references (e.g., &copy;) in
the program.
When I run the following against a chunk of XML
containing &copy;, I get the following:
org.xml.sax.SAXParseException: Reference to undefined entity "&copy;".
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3182)
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3176)
at org.apache.crimson.parser.Parser2.expandEntityInContent(Parser2.java:2513)
at org.apache.crimson.parser.Parser2.maybeReferenceInContent(Parser2.java:2422)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1833)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.parseInternal(Parser2.java:500)
at org.apache.crimson.parser.Parser2.parse(Parser2.java:305)
at org.apache.crimson.parser.XMLReaderImpl.parse(XMLReaderImpl.java:442)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:345)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:281)
at Article.main(Article.java:18)
What can I do to catch these references in my code and output replacement
text for them?
Thanks.
Dean Hoover
Here are the two Java files:
---
import java.io.*;
import javax.xml.parse rs.*;
import org.xml.sax.*;
import org.xml.sax.hel pers.*;
public class Article
{
public static void main(String argv[])
{
String file = argv[0];
PrintWriter pw = new PrintWriter(Sys tem.out);
DefaultHandler handler = new LoadXML(pw, LoadXML.TYPE_HT ML);
SAXParserFactor y factory = SAXParserFactor y.newInstance() ;
try
{
SAXParser reader = factory.newSAXP arser();
reader.parse(ne w File(file), handler);
}
catch (Exception e)
{
e.printStackTra ce();
return;
}
pw.flush();
}
}
---
import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;

/**
 * SAX handler that renders a small article vocabulary as HTML on a Writer.
 *
 * In TYPE_HTML mode the elements "p", "title", "by" and "copyright" each
 * become a &lt;p&gt; with a matching CSS class, and "br" becomes
 * &lt;br/&gt;. Text directly inside one of the four paragraph-like elements
 * is whitespace-normalised, HTML-escaped and written; all other text is
 * dropped. In any other mode nothing is written.
 */
public class LoadXML extends DefaultHandler
{
    public static final int TYPE_HTML = 1;
    public static final int TYPE_TEXT = 2;

    /**
     * @param writer destination for the generated output
     * @param type   output mode, TYPE_HTML or TYPE_TEXT
     */
    public LoadXML
    (
        java.io.Writer writer,
        int type
    )
    {
        elements_ = new Stack();
        text_ = new StringBuffer();
        writer_ = writer;
        type_ = type;
    }

    /**
     * Supplies the literal text "stuff" as the content of every external
     * entity the parser asks about.
     *
     * This hook concerns external entities only. It is not consulted for a
     * reference to an entity that was never declared (such as &amp;copy; in a
     * document without a DTD); the parser reports that as a fatal error.
     *
     * @param publicId public identifier of the external entity, may be null
     * @param systemId system identifier of the external entity
     * @return an input source reading the replacement content
     */
    public InputSource resolveEntity
    (
        String publicId,
        String systemId
    ) throws SAXException
    {
        String s = "stuff";
        return new InputSource(new CharArrayReader(s.toCharArray()));
    }

    /** Resets the per-document state so one handler can serve several parses. */
    public void startDocument() throws SAXException
    {
        elements_.clear();
        text_.setLength(0);
    }

    /** Nothing to do; flushing the writer is left to the caller. */
    public void endDocument() throws SAXException
    {
    }

    /**
     * Emits any text pending for the enclosing element, records the new
     * element as the current one, and in HTML mode writes its opening tag.
     *
     * @throws SAXException wrapping any IOException from the writer
     */
    public void startElement
    (
        String uri,
        String localName,
        String qName,
        Attributes attributes
    ) throws SAXException
    {
        // Text seen so far belongs to the parent, so emit it before pushing.
        flushText();

        String elementName = qName;
        elements_.push(elementName);

        if (type_ == TYPE_HTML)
        {
            String cssClass = cssClassFor(elementName);
            if (cssClass != null)
                write("<p class=\"" + cssClass + "\">");
        }
    }

    /**
     * Emits any text pending for the element being closed, removes it from
     * the element stack, and in HTML mode writes its closing markup.
     *
     * @throws SAXException wrapping any IOException from the writer
     */
    public void endElement
    (
        String uri,
        String localName,
        String qName
    ) throws SAXException
    {
        // Must run before the pop: the pending text belongs to this element.
        flushText();

        String elementName = qName;
        elements_.pop();

        if (type_ == TYPE_HTML)
        {
            if (cssClassFor(elementName) != null)
                write("</p>\n");
            else if (elementName.equals("br"))
                write("<br/>\n");
        }
    }

    /**
     * Collects character data. A parser may deliver one run of text in
     * several calls (typically split around references such as &amp;amp;), so
     * the text is only buffered here; normalising each chunk on its own would
     * strip the spaces at the chunk boundaries and glue words together.
     */
    public void characters
    (
        char[] ch,
        int start,
        int length
    ) throws SAXException
    {
        text_.append(ch, start, length);
    }

    /**
     * Writes the buffered text run, if any, and empties the buffer. Newlines
     * become spaces, runs of spaces collapse to one, and leading and trailing
     * whitespace is trimmed. The text is written only in HTML mode and only
     * when the current element is one of the paragraph-like elements.
     */
    private void flushText() throws SAXException
    {
        if (text_.length() == 0)
            return;

        String text =
            text_.toString().replaceAll("\n", " ").replaceAll(" +", " ").trim();
        text_.setLength(0);

        if (text.length() == 0 || type_ != TYPE_HTML || elements_.isEmpty())
            return;

        String top = (String)elements_.peek();
        if (cssClassFor(top) != null)
            write(escape(text));
    }

    /**
     * Returns the CSS class used for a paragraph-like element, or null when
     * the element is not rendered as a paragraph.
     */
    private static String cssClassFor(String elementName)
    {
        if (elementName.equals("p"))
            return "article-text";
        if (elementName.equals("title"))
            return "article-title";
        if (elementName.equals("by"))
            return "article-by";
        if (elementName.equals("copyright"))
            return "article-copyright";
        return null;
    }

    /**
     * Escapes the characters that are markup in HTML text content. The parser
     * hands over already-decoded text, so a literal ampersand or angle bracket
     * has to be re-encoded before it is written into the HTML output.
     */
    private static String escape(String text)
    {
        StringBuffer out = new StringBuffer(text.length() + 16);
        for (int i = 0; i < text.length(); i++)
        {
            char c = text.charAt(i);
            switch (c)
            {
                case '&': out.append("&amp;"); break;
                case '<': out.append("&lt;"); break;
                case '>': out.append("&gt;"); break;
                default: out.append(c);
            }
        }
        return out.toString();
    }

    /** Writes to the output, converting an IOException into a SAXException. */
    private void write(String s) throws SAXException
    {
        try
        {
            writer_.write(s);
        }
        catch (IOException e)
        {
            throw new SAXException(e);
        }
    }

    private Stack elements_;
    private StringBuffer text_;
    private java.io.Writer writer_;
    private int type_;
}
Comment