Skip to content

Commit 858cbaa

Browse files
author
Ram Kumar Hariharan
committed Oct 6, 2010
Completed and ran the Linguistic Feature Extractor independently
1 parent 0bfac77 commit 858cbaa

File tree

3 files changed

+1009
-6
lines changed

3 files changed

+1009
-6
lines changed
 

‎.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
*.class
33
stanford-parser-2010-08-20
44
output/*
5+
bin/

‎data/train_smalldataset

+1,000
Large diffs are not rendered by default.

‎src/LinguisticFeatureExtractor.java

+8-6
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,13 @@ public class LinguisticFeatureExtractor implements FeatureExtractor{
2323
private FastVector attrs;
2424
private FastVector pos_relations;
2525
private static final int OFFSET = 1;
26+
private static LexicalizedParser lp;
2627

2728
private void setupAttributes(List<Tweet> tweets)
2829
{
30+
lp = new LexicalizedParser("res/englishPCFG.ser.gz");
31+
lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"});
32+
2933
attrs = new FastVector();
3034
// Determine attributes
3135
FastVector sentvals = new FastVector();
@@ -59,14 +63,14 @@ public Instances extractFeatures(List<Tweet> tweets) {
5963
GrammaticalStructureFactory gsf;
6064
GrammaticalStructure gs;
6165
Collection tdl;
62-
LexicalizedParser lp = new LexicalizedParser("res/englishPCFG.ser.gz");
63-
lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"});
66+
6467
tlp = new PennTreebankLanguagePack();
6568
gsf = tlp.grammaticalStructureFactory();
66-
69+
6770
for(Tweet t: tweets)
6871
{
6972
Instance inst = new Instance(1.0, new double[attrs.size()]);
73+
inst.setDataset(feats);
7074
inst.setClassValue(t.sentiment);
7175

7276
st = new StringTokenizer(t.text);
@@ -87,7 +91,6 @@ public Instances extractFeatures(List<Tweet> tweets) {
8791
inst.setValue(pos_relations.indexOf(x.reln().toString())+OFFSET, 1);
8892
}
8993

90-
inst.setDataset(feats);
9194
feats.add(inst);
9295
}
9396

@@ -114,8 +117,7 @@ private static FastVector get_all_pos_relations(List<Tweet> tweets)
114117
GrammaticalStructureFactory gsf;
115118
GrammaticalStructure gs;
116119
Collection tdl;
117-
LexicalizedParser lp = new LexicalizedParser("res/englishPCFG.ser.gz");
118-
lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"});
120+
119121
tlp = new PennTreebankLanguagePack();
120122
gsf = tlp.grammaticalStructureFactory();
121123
int postags_count = 0;

0 commit comments

Comments
 (0)
Please sign in to comment.