org.knowceans.corpus.parsers.dpa
Class DpaSaxParser
java.lang.Object
org.xml.sax.helpers.DefaultHandler
org.knowceans.corpus.parsers.dpa.DpaSaxParser
- All Implemented Interfaces:
- org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler
public class DpaSaxParser
- extends org.xml.sax.helpers.DefaultHandler
DpaSaxParser parses a DPA document corpus from a single pseudo-XML file. This
version does not validate the content of tags and is robust against encoding
problems. Necessary replacements in the input (Xerces will not accept the
originals): replace & with &amp;, and find all invalid unbalanced tags via the
regex "\w+ +<\w" and remove them.
- Author:
- heinrich
Method Summary |
private void |
aggregateSentences(java.lang.String s)
Adds text to the sentenceBuffer. |
void |
characters(char[] buf,
int offset,
int len)
|
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams,
boolean meldungenAsDocs,
boolean sentencesAsDocs)
configure the parser. |
private void |
emit(java.lang.String s)
|
void |
endElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName)
|
void |
error(org.xml.sax.SAXParseException e)
|
void |
fatalError(org.xml.sax.SAXParseException e)
|
static void |
main(java.lang.String[] argv)
|
private void |
nl()
|
private java.util.Vector<DpaDocument> |
parse(java.lang.String file)
|
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private java.lang.String |
replaceAbbreviations(java.lang.String s)
|
void |
startElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName,
org.xml.sax.Attributes attrs)
|
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
endDocument, endPrefixMapping, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
out
private java.io.Writer out
curDoc
private DpaDocument curDoc
inBody
private boolean inBody
withinSentence
private boolean withinSentence
stop
private StopWordFilter stop
useStemming
public boolean useStemming
useBigrams
public boolean useBigrams
useUnigrams
public boolean useUnigrams
useMeldungenAsDocuments
public boolean useMeldungenAsDocuments
useSentencesAsDocuments
public boolean useSentencesAsDocuments
prevWord
private java.lang.String prevWord
cat
private DpaCategories cat
stem
private Stemmer stem
allDocs
private java.util.Vector<DpaDocument> allDocs
tagstack
private java.util.Stack<java.lang.String> tagstack
sentenceBuffer
private java.util.Vector<java.lang.String> sentenceBuffer
DpaSaxParser
public DpaSaxParser()
- Parameters:
none (this constructor takes no arguments; the original entry "argv" does not
match the signature)
DpaSaxParser
public DpaSaxParser(java.lang.String stoplist)
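- Parameters:
stoplist
- not described in the original documentation (which listed "argv" here);
presumably identifies the stop-word list to use -- confirm against the source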
main
public static void main(java.lang.String[] argv)
configure
public void configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams,
boolean meldungenAsDocs,
boolean sentencesAsDocs)
- Configures the parser.
- Parameters:
useStemming
- use stemming
useUnigrams
- use unigrams
useBigrams
- use bigrams
meldungenAsDocs
-
sentencesAsDocs
-
parse
private java.util.Vector<DpaDocument> parse(java.lang.String file)
emit
private void emit(java.lang.String s)
throws org.xml.sax.SAXException
- Throws:
org.xml.sax.SAXException
nl
private void nl()
throws org.xml.sax.SAXException
- Throws:
org.xml.sax.SAXException
startElement
public void startElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName,
org.xml.sax.Attributes attrs)
throws org.xml.sax.SAXException
- Specified by:
startElement
in interface org.xml.sax.ContentHandler
- Overrides:
startElement
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException
endElement
public void endElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName)
throws org.xml.sax.SAXException
- Specified by:
endElement
in interface org.xml.sax.ContentHandler
- Overrides:
endElement
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException
characters
public void characters(char[] buf,
int offset,
int len)
throws org.xml.sax.SAXException
- Specified by:
characters
in interface org.xml.sax.ContentHandler
- Overrides:
characters
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException
aggregateSentences
private void aggregateSentences(java.lang.String s)
- Adds text to the sentenceBuffer. Removes dots from abbreviations and
splits at sentence full stops.
- Parameters:
s
-
parseText
private int parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
- Parses the given text and adds terms to the model. Stop-word filtering and
stemming are performed here.
- Parameters:
s
-
- Returns:
- number of terms added to words.
replaceAbbreviations
private java.lang.String replaceAbbreviations(java.lang.String s)
- Parameters:
s
-
- Returns:
removePunct
private java.lang.String removePunct(java.lang.String s)
- Remove all punctuation
- Parameters:
s
-
- Returns:
error
public void error(org.xml.sax.SAXParseException e)
throws org.xml.sax.SAXException
- Specified by:
error
in interface org.xml.sax.ErrorHandler
- Overrides:
error
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException
fatalError
public void fatalError(org.xml.sax.SAXParseException e)
throws org.xml.sax.SAXException
- Specified by:
fatalError
in interface org.xml.sax.ErrorHandler
- Overrides:
fatalError
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException