|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.xml.sax.helpers.DefaultHandler org.knowceans.corpus.parsers.igdbib.IgdSaxParser
public class IgdSaxParser
IgdSaxParser parses an IGD library document corpus from a single pseudo-XML file. This version does not validate the content of tags and is robust against encoding problems. Necessary replacements (Xerces will not accept them): and -> &; also find all invalid unbalanced tags via the regex "\w+ +<\w" and remove them.
Field Summary | |
---|---|
private static java.util.Vector<IgdDocument> |
allDocs
|
private static IgdCategories |
cat
|
private static IgdDocument |
curDoc
|
private static boolean |
inBody
|
private static boolean |
inSentence
|
private static java.io.Writer |
out
|
private static java.lang.String |
prevWord
|
private static Stemmer |
stem
|
private static StopWordFilter |
stop
|
private static java.util.Stack<java.lang.String> |
tagstack
|
boolean |
useBigrams
|
boolean |
useStemming
|
Constructor Summary | |
---|---|
IgdSaxParser()
|
Method Summary | |
---|---|
void |
characters(char[] buf,
int offset,
int len)
|
void |
configure(boolean useStemming,
boolean useBigrams)
configure the parser. |
private void |
emit(java.lang.String s)
|
void |
endElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName)
|
void |
error(org.xml.sax.SAXParseException e)
|
void |
fatalError(org.xml.sax.SAXParseException e)
|
static void |
main(java.lang.String[] argv)
|
private void |
nl()
|
private java.util.Vector<IgdDocument> |
parse(java.lang.String file)
|
private void |
parseAuthors(java.lang.String s,
java.util.Vector<java.lang.String> authors)
Parses an author string of the format "surname, givenname (department); surname, ...". |
private void |
parseKeywords(java.lang.String s,
java.util.Vector<java.lang.String> keywords)
parses a keyword string of the format keyword; keyword; ... |
private void |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
void |
startElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName,
org.xml.sax.Attributes attrs)
|
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
---|
endDocument, endPrefixMapping, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private static java.io.Writer out
private static IgdDocument curDoc
private static boolean inBody
private static boolean inSentence
private static StopWordFilter stop
public boolean useStemming
public boolean useBigrams
private static java.lang.String prevWord
private static IgdCategories cat
private static Stemmer stem
private static java.util.Vector<IgdDocument> allDocs
private static java.util.Stack<java.lang.String> tagstack
Constructor Detail |
---|
public IgdSaxParser()
argv -
Method Detail |
---|
public static void main(java.lang.String[] argv)
public void configure(boolean useStemming, boolean useBigrams)
useStemming - use stemming
useBigrams - use bigrams
private java.util.Vector<IgdDocument> parse(java.lang.String file)
private void emit(java.lang.String s) throws org.xml.sax.SAXException
org.xml.sax.SAXException
private void nl() throws org.xml.sax.SAXException
org.xml.sax.SAXException
public void startElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName, org.xml.sax.Attributes attrs) throws org.xml.sax.SAXException
startElement
in interface org.xml.sax.ContentHandler
startElement
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void error(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException
error
in interface org.xml.sax.ErrorHandler
error
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void fatalError(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException
fatalError
in interface org.xml.sax.ErrorHandler
fatalError
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void endElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName) throws org.xml.sax.SAXException
endElement
in interface org.xml.sax.ContentHandler
endElement
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void characters(char[] buf, int offset, int len) throws org.xml.sax.SAXException
characters
in interface org.xml.sax.ContentHandler
characters
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
private void parseKeywords(java.lang.String s, java.util.Vector<java.lang.String> keywords)
s -
keywords -
private void parseAuthors(java.lang.String s, java.util.Vector<java.lang.String> authors)
s -
authors -
private void parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
s -
private java.lang.String removePunct(java.lang.String s)
s
-
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |