|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.knowceans.corpus.parsers.nips.NipsXmlReader
public class NipsXmlReader
NipsXmlReader parses one XML file converted from NIPS PDF documents using XPDF-based pdftohtml
Field Summary | |
---|---|
(package private) java.lang.String |
_bold
|
(package private) java.lang.String |
_fontspec
|
(package private) java.lang.String |
_italics
|
(package private) java.lang.String |
_page
|
(package private) java.lang.String |
_text
|
(package private) java.util.regex.Pattern |
boldpattern
|
(package private) java.util.HashMap<java.lang.String,java.lang.Double> |
fontsizes
|
(package private) java.util.regex.Pattern |
fontspecpattern
|
private boolean |
includerefs
whether to include references in the text |
private boolean |
inrefs
|
(package private) java.util.regex.Pattern |
italicspattern
|
(package private) java.util.regex.Pattern |
pagepattern
|
(package private) java.util.regex.Pattern |
textpattern
|
Constructor Summary | |
---|---|
NipsXmlReader()
initialise reader (e.g., compile regex patterns) |
Method Summary | |
---|---|
private java.lang.String |
clean(java.lang.String in)
cleans the string. |
private java.lang.String |
cleanUp(java.lang.String in)
|
void |
extract(java.lang.String filename,
NipsDocument doc)
|
private java.util.Vector<java.lang.String> |
extractPage(java.lang.StringBuffer content)
extracts the content of a page. |
private java.lang.String[] |
getHead(java.lang.StringBuffer content)
extract title, authors and abstract from (the first page of a) document. |
private java.util.Vector<java.lang.String> |
getPages(java.lang.StringBuffer content)
|
static void |
main(java.lang.String[] args)
|
private void |
processText(java.lang.StringBuffer content,
NipsDocument doc)
|
private java.lang.String |
replaceUmlauts(java.lang.String in)
inserts umlauts. TODO: direct Unicode multi-character replacements |
private int |
setFonts(java.lang.StringBuffer content)
sets the font sizes for the current page |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
java.lang.String _page
java.lang.String _fontspec
java.lang.String _text
java.lang.String _bold
java.lang.String _italics
java.util.regex.Pattern pagepattern
java.util.regex.Pattern fontspecpattern
java.util.regex.Pattern textpattern
java.util.regex.Pattern boldpattern
java.util.regex.Pattern italicspattern
java.util.HashMap<java.lang.String,java.lang.Double> fontsizes
private boolean inrefs
private boolean includerefs
Constructor Detail |
---|
public NipsXmlReader()
Method Detail |
---|
public static void main(java.lang.String[] args)
public void extract(java.lang.String filename, NipsDocument doc)
filename
- doc
- document record (existing or will be created)
private void processText(java.lang.StringBuffer content, NipsDocument doc)
content
-
private java.lang.String[] getHead(java.lang.StringBuffer content)
content
- the content that the heading data is extracted from. The
extracted parts are stripped from the content.
private java.lang.String cleanUp(java.lang.String in)
in
-
private int setFonts(java.lang.StringBuffer content)
content
-
private java.lang.String replaceUmlauts(java.lang.String in)
in
-
private java.lang.String clean(java.lang.String in)
in
-
private java.util.Vector<java.lang.String> getPages(java.lang.StringBuffer content)
content
-
private java.util.Vector<java.lang.String> extractPage(java.lang.StringBuffer content)
content
-
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |