|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.xml.sax.helpers.DefaultHandler org.knowceans.corpus.parsers.dpa.DpaSaxParser2
public class DpaSaxParser2
DpaSaxParser parses a DPA document corpus from a single pseudo-xml file. This version does not validate the content of tags and is robust against encoding problems. Necessary replacements (Xerces will not accept the originals): & -> &amp; and find all invalid unbalanced tags via regex "\w+ +<\w" and remove them.
This sax parser adheres to a stricter XML format (an interim version of the converted XML by Fraunhofer AIS). It uses a complete directory tree as a source of separate XML files.
TODO: set the parser working dir to the data files (currently, ../dpa.dtd in the xml files points to the eclipse workspace).
Field Summary | |
---|---|
private java.util.Vector<DpaDocument> |
allDocs
|
private IptcCategories |
cat
|
private DpaDocument |
curDoc
|
private boolean |
inBody
|
private boolean |
inTitle
|
private java.io.Writer |
out
|
private java.lang.String |
prevWord
|
private java.util.Vector<java.lang.String> |
sentenceBuffer
|
private Stemmer |
stem
|
private StopWordFilter |
stop
|
private java.util.Stack<java.lang.String> |
tagstack
|
boolean |
useBigrams
|
boolean |
useMeldungenAsDocuments
|
private boolean |
usePars
TODO: paragraph processing does not check for "non-paragraph" content, i.e. crashes for paragraphIndex.size() == 0. --> disabled; not used for experiments, anyway. |
boolean |
useSentencesAsDocuments
|
boolean |
useStemming
|
boolean |
useUnigrams
|
private boolean |
withinSentence
|
Constructor Summary | |
---|---|
DpaSaxParser2()
|
|
DpaSaxParser2(java.lang.String stoplist)
|
Method Summary | |
---|---|
private void |
aggregateSentences(java.lang.String s)
Adds text to the sentenceBuffer. |
void |
characters(char[] buf,
int offset,
int len)
|
private void |
checkParagraph(java.lang.String s)
Checks if a new paragraph is |
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams,
boolean meldungenAsDocs,
boolean sentencesAsDocs)
Configure the parser. |
private void |
emit(java.lang.String s)
|
void |
endElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName)
|
void |
error(org.xml.sax.SAXParseException e)
|
void |
fatalError(org.xml.sax.SAXParseException e)
|
static void |
main(java.lang.String[] argv)
|
private void |
nl()
|
private java.util.Vector<DpaDocument> |
parse(java.lang.String file)
|
private java.util.Vector<DpaDocument> |
parseDir(java.lang.String sourcefile)
Parse directory by adding each XML file's content sequentially. |
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private java.lang.String |
replaceAbbreviations(java.lang.String s)
|
void |
startElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName,
org.xml.sax.Attributes attrs)
Opening tag callback for new DPA dataset (2000). |
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
---|
endDocument, endPrefixMapping, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private java.io.Writer out
private DpaDocument curDoc
private boolean inBody
private boolean inTitle
private boolean withinSentence
private StopWordFilter stop
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
public boolean useMeldungenAsDocuments
public boolean useSentencesAsDocuments
private java.lang.String prevWord
private IptcCategories cat
private Stemmer stem
private java.util.Vector<DpaDocument> allDocs
private java.util.Stack<java.lang.String> tagstack
private java.util.Vector<java.lang.String> sentenceBuffer
private boolean usePars
Constructor Detail |
---|
public DpaSaxParser2()
argv -
public DpaSaxParser2(java.lang.String stoplist)
argv -
Method Detail |
---|
public static void main(java.lang.String[] argv)
public void configure(boolean useStemming, boolean useUnigrams, boolean useBigrams, boolean meldungenAsDocs, boolean sentencesAsDocs)
useStemming - use stemming
useUnigrams - use unigrams
useBigrams - use bigrams
sentencesAsDocs -
meldungenAsDocs -
private java.util.Vector<DpaDocument> parse(java.lang.String file)
private java.util.Vector<DpaDocument> parseDir(java.lang.String sourcefile)
sourcefile
-
private void emit(java.lang.String s) throws org.xml.sax.SAXException
org.xml.sax.SAXException
private void nl() throws org.xml.sax.SAXException
org.xml.sax.SAXException
public void startElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName, org.xml.sax.Attributes attrs) throws org.xml.sax.SAXException
startElement
in interface org.xml.sax.ContentHandler
startElement
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void endElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName) throws org.xml.sax.SAXException
endElement
in interface org.xml.sax.ContentHandler
endElement
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void characters(char[] buf, int offset, int len) throws org.xml.sax.SAXException
characters
in interface org.xml.sax.ContentHandler
characters
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
private void aggregateSentences(java.lang.String s)
s -
private void checkParagraph(java.lang.String s)
s -
private int parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
s -

private java.lang.String replaceAbbreviations(java.lang.String s)
s
-
private java.lang.String removePunct(java.lang.String s)
s
-
public void error(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException
error
in interface org.xml.sax.ErrorHandler
error
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void fatalError(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException
fatalError
in interface org.xml.sax.ErrorHandler
fatalError
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |