|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.xml.sax.helpers.DefaultHandler org.knowceans.corpus.parsers.TxtParser
public class TxtParser
TxtParser parses a set of plain text files into a TextCorpus, treating each file as one document. This implementation uses the "old" parser code rather than Lucene etc.
Field Summary | |
---|---|
private java.util.Vector<SimpleDocument> |
allDocs
|
private java.io.BufferedWriter |
bw
|
private int |
nr
|
private java.lang.String |
prevWord
|
private Stemmer |
stem
|
private StopWordFilter |
stop
|
boolean |
useBigrams
|
boolean |
useStemming
|
boolean |
useUnigrams
|
private java.lang.String |
xmlfile
|
Constructor Summary | |
---|---|
TxtParser()
|
|
TxtParser(java.lang.String stoplist)
|
Method Summary | |
---|---|
private void |
closeOutfile()
|
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams)
configure the parser. |
private boolean |
isValid(java.lang.String sourcefile)
True for txt files. |
static void |
main(java.lang.String[] argv)
|
private void |
openOutfile()
|
java.util.Vector<SimpleDocument> |
parse(java.lang.String file,
int mindl)
opens the file and parses the content as one document |
private java.util.Vector<SimpleDocument> |
parseDir(java.lang.String sourcefile,
int mindl)
Parse directory by adding each XML file's content sequentially. |
private void |
parseString(java.lang.String file,
java.lang.String string,
int mindl)
parses the string |
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private void |
setXmlOutput(java.lang.String xmlfile)
|
private void |
writeText(java.lang.String file,
int id,
java.lang.String text)
|
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
---|
characters, endDocument, endElement, endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startElement, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private StopWordFilter stop
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
private java.lang.String prevWord
private int nr
private Stemmer stem
private java.util.Vector<SimpleDocument> allDocs
private java.io.BufferedWriter bw
private java.lang.String xmlfile
Constructor Detail |
---|
public TxtParser()
argv
-
public TxtParser(java.lang.String stoplist)
argv
-
Method Detail |
---|
public static void main(java.lang.String[] argv)
private void setXmlOutput(java.lang.String xmlfile)
public void configure(boolean useStemming, boolean useUnigrams, boolean useBigrams)
useStemming
- use stemming
useUnigrams
- use unigrams
useBigrams
- use bigrams
sentencesAsDocs
-
meldungenAsDocs
-
private java.util.Vector<SimpleDocument> parseDir(java.lang.String sourcefile, int mindl)
sourcefile
-
mindl
- minimum doc length
private boolean isValid(java.lang.String sourcefile)
sourcefile
-
public java.util.Vector<SimpleDocument> parse(java.lang.String file, int mindl)
mindl
- minimum required document length
private void parseString(java.lang.String file, java.lang.String string, int mindl)
string
-
mindl
-
private void openOutfile()
private void writeText(java.lang.String file, int id, java.lang.String text)
private void closeOutfile()
private int parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
s
-
private java.lang.String removePunct(java.lang.String s)
s
-
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |