利用mmSeg4j分词实现网页文本倾向性分析
最近一直在做网页情感倾向性分析的工作,找了一些论文,发现基于机器学习的算法在项目中不太合适,于是自己鼓捣了一套基于中文分词和正负面词库的分析算法。
原理很简单:
文章倾向性 = ∑(出现的正面词汇 × 权重) − ∑(出现的负面词汇 × 权重)。
在这个基础上对于负面新闻再加上相关性判断。
在中文分词方面选择了mmSeg4j,没别的原因,就是之前一直用这个,相对来说性能非常不错,但有些词汇需要自己添加到他的words.dic文件中。mmSeg4j下载地址:http://code.google.com/p/mmseg4j/。
在正式编码之前规划了3个文本文件:
- neg_words: 配置负面词汇,每个词一行,格式为“太差-1”。“-”后面的数字作为负面词汇的权重。
- pos_words:配置的正面词汇,配置方式与负面词汇类似。
- rel_words: 相关词汇表,每行一个词即可,增加这个配置文件是为了识别出与特定内容相关的文本情感。如:仅关心近期与“万科”有关的分析。
在工程启动时将这三个文件加载到一个对象中(单例)代码如下:
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.springframework.stereotype.Component;

import com.yidatec.vis.psms.commons.PSMSConstants;
-
@Component
-
public class TrendencyWordsLoader {
-
-
private Map<String, Integer> negWordMap;
-
private Map<String, Integer> posWordMap;
-
private List<String> refWordList;
-
-
public TrendencyWordsLoader(){
- loadWords();
- }
-
-
private void loadWords(){
-
-
negWordMap = new HashMap<String, Integer>();
-
posWordMap = new HashMap<String, Integer>();
-
refWordList = new ArrayList<String>();
-
-
try {
-
-
FileReader fr = new FileReader(this.getClass().getClassLoader().getResource(PSMSConstants.NEG_WORDS_PATH).getFile());
-
BufferedReader br = new BufferedReader(fr);
-
-
String line = null;
-
-
while((line = br.readLine()) != null){
-
-
String[] words = line.split("-");
-
negWordMap.put(words[0], Integer.parseInt(words[1]));
- }
-
-
fr = new FileReader(this.getClass().getClassLoader().getResource(PSMSConstants.POS_WORDS_PATH).getFile());
-
br = new BufferedReader(fr);
-
line = null;
-
-
while((line = br.readLine()) != null){
-
-
String[] words = line.split("-");
-
posWordMap.put(words[0], Integer.parseInt(words[1]));
- }
-
-
fr = new FileReader(this.getClass().getClassLoader().getResource(PSMSConstants.REL_WORDS_PATH).getFile());
-
br = new BufferedReader(fr);
-
line = null;
-
-
while((line = br.readLine()) != null){
- refWordList.add(line);
- }
-
- br.close();
- fr.close();
-
-
} catch (FileNotFoundException e) {
- e.printStackTrace();
-
} catch (NumberFormatException e) {
- e.printStackTrace();
-
} catch (IOException e) {
- e.printStackTrace();
- }
- }
-
-
public Map<String, Integer> getNegWordMap() {
-
return negWordMap;
- }
-
-
public Map<String, Integer> getPosWordMap() {
-
return posWordMap;
- }
-
-
public List<String> getRefWordList() {
-
return refWordList;
- }
- }
加载词汇表后,就可以使用mmSeg4j对网页文本进行分词,并进行分析了,代码如下:
- import java.io.IOException;
-
import java.io.Reader;
-
import java.io.StringReader;
-
import java.util.ArrayList;
-
import java.util.HashMap;
-
import java.util.List;
-
import java.util.Map;
-
import java.util.Set;
-
-
import org.springframework.beans.factory.annotation.Autowired;
-
import org.springframework.stereotype.Component;
-
-
import com.chenlb.mmseg4j.ComplexSeg;
-
import com.chenlb.mmseg4j.Dictionary;
-
import com.chenlb.mmseg4j.MMSeg;
-
import com.chenlb.mmseg4j.Word;
-
import com.yidatec.vis.psms.entity.SolrQueryResult;
-
-
@Component
-
public class TrendencyAnalyser {
-
-
@Autowired
- TrendencyWordsLoader wordLoader;
-
-
protected static final Dictionary dic = Dictionary.getInstance();
-
protected static final ComplexSeg seg = new ComplexSeg(dic);
-
-
-
-
-
private final int PS_THRESHOLD = 50;
-
-
-
-
-
private final int NS_THRESHOLD = 30;
-
-
-
-
-
private Map<String, List<Word>> segments = null;
-
private List<Word> negs = null;
-
private List<Word> poses = null;
-
private List<Word> rels = null;
-
-
public int analyzeTrendency(String title, String content) {
-
-
try {
-
-
boolean flag = isRelTitle(title);
-
-
if (flag) {
-
-
int titleTendency = getTitleTrendency();
-
-
if (titleTendency < 0) {
-
return SolrQueryResult.NEGATIVE_NATURE;
-
} else if (titleTendency > 0) {
-
return SolrQueryResult.POSITIVE_NATURE;
- }
- }
-
- clearAll();
-
-
initSegmentsMap(new StringReader(title + " " + content));
-
- parseNegWordsMap();
-
- parsePosWordsMap();
-
-
int result = analyzeContentsTrendency();
-
-
if (flag) {
-
-
if (result < 0) {
-
-
return SolrQueryResult.NEGATIVE_NATURE;
-
-
} else if (result == 0) {
-
-
return SolrQueryResult.NEUTRAL_NATURE;
-
-
} else {
-
-
return SolrQueryResult.POSITIVE_NATURE;
-
- }
-
-
} else {
-
- parseRelWordsMap();
-
-
if (result < 0) {
-
-
if (analyzeTrendencyByMatrix()) {
-
-
return SolrQueryResult.NEGATIVE_NATURE;
-
-
} else {
-
-
return SolrQueryResult.NEUTRAL_NATURE;
-
- }
-
-
} else if (result == 0) {
-
-
return SolrQueryResult.NEUTRAL_NATURE;
-
-
} else {
-
-
return SolrQueryResult.POSITIVE_NATURE;
-
- }
-
- }
-
-
} catch (IOException e) {
-
return SolrQueryResult.NEUTRAL_NATURE;
- }
- }
-
-
private void clearAll() {
-
-
if (segments != null) {
- segments.clear();
- }
-
if (negs != null) {
- negs.clear();
- }
-
if (poses != null) {
- poses.clear();
- }
- }
-
-
-
-
-
-
-
-
private boolean isRelTitle(String title) {
-
-
try {
-
-
initTitleSegmentsMap(new StringReader(title));
-
- List<String> relWords = wordLoader.getRefWordList();
-
-
for (String word : relWords) {
-
-
if (segments.containsKey(word)) {
-
return true;
- }
-
- }
-
-
} catch (IOException e) {
-
return false;
- }
-
-
return false;
-
- }
-
-
-
-
-
-
-
-
private int getTitleTrendency() {
-
- parseNegWordsMap();
- parsePosWordsMap();
-
-
return analyzeContentsTrendency();
-
- }
-
-
-
-
-
-
-
-
-
private int analyzeContentsTrendency() {
-
-
int negScore = 0;
-
int posScore = 0;
-
-
if (negs != null && negs.size() > 0) {
-
-
for (Word word : negs) {
- negScore += wordLoader.getNegWordMap().get(word.getString());
- }
-
- }
-
-
if (poses != null && poses.size() > 0) {
-
-
for (Word word : poses) {
- posScore += wordLoader.getPosWordMap().get(word.getString());
- }
- }
-
-
return posScore - negScore;
- }
-
-
-
-
-
-
-
private boolean analyzeTrendencyByMatrix() {
-
-
if (rels == null || rels.size() == 0) {
-
return false;
- }
-
-
if (negs == null || negs.size() == 0) {
-
return false;
- }
-
-
for (int i = 0; i < rels.size(); i++) {
-
-
for (int j = 0; j < negs.size(); j++) {
-
- Word relWord = rels.get(i);
- Word negWord = negs.get(j);
-
-
if (relWord.getStartOffset() < negWord.getStartOffset()) {
-
-
if (negWord.getStartOffset() - relWord.getStartOffset()
- - relWord.getLength() < PS_THRESHOLD) {
-
-
return true;
-
- }
-
-
} else {
-
if (relWord.getStartOffset() - negWord.getStartOffset()
- - negWord.getLength() < NS_THRESHOLD) {
-
return true;
- }
- }
-
- }
-
- }
-
-
return false;
-
- }
-
-
-
-
-
-
-
-
private void initTitleSegmentsMap(Reader reader) throws IOException {
-
-
segments = new HashMap<String, List<Word>>();
-
-
MMSeg mmSeg = new MMSeg(reader, seg);
-
-
Word word = null;
-
-
while ((word = mmSeg.next()) != null) {
-
-
if (segments.containsKey(word.getString())) {
-
- segments.get(word.getString()).add(word);
- }
-
-
List<Word> words = new ArrayList<Word>();
-
- words.add(word);
-
- segments.put(word.getString(), words);
-
- }
- }
-
-
-
-
-
-
-
-
private void initSegmentsMap(Reader reader) throws IOException {
-
-
if (segments == null) {
-
segments = new HashMap<String, List<Word>>();
- }
-
-
MMSeg mmSeg = new MMSeg(reader, seg);
-
-
Word word = null;
-
-
while ((word = mmSeg.next()) != null) {
-
-
if (segments.containsKey(word.getString())) {
-
- segments.get(word.getString()).add(word);
- }
-
-
List<Word> words = new ArrayList<Word>();
-
- words.add(word);
-
- segments.put(word.getString(), words);
-
- }
-
- }
-
-
-
-
-
private void parseNegWordsMap() {
-
- Map<String, Integer> negMap = wordLoader.getNegWordMap();
- Set<String> negKeys = negMap.keySet();
-
-
for (String negKey : negKeys) {
-
- List<Word> negWords = segments.get(negKey);
-
-
if (negWords != null) {
-
-
if (negs == null) {
-
negs = new ArrayList<Word>();
- }
-
- negs.addAll(negWords);
-
- }
-
- }
-
- }
-
-
-
-
-
private void parsePosWordsMap() {
-
- Map<String, Integer> posMap = wordLoader.getPosWordMap();
- Set<String> posKeys = posMap.keySet();
-
-
for (String posKey : posKeys) {
-
- List<Word> posWords = segments.get(posKey);
-
-
if (posWords != null) {
-
-
if (poses == null) {
-
poses = new ArrayList<Word>();
- }
-
- poses.addAll(posWords);
-
- }
-
- }
- }
-
-
-
-
-
private void parseRelWordsMap() {
-
- List<String> refWords = wordLoader.getRefWordList();
-
-
for (String word : refWords) {
-
- List<Word> relWords = segments.get(word);
-
-
if (relWords != null) {
-
-
if (rels == null) {
-
rels = new ArrayList<Word>();
- }
-
- rels.addAll(relWords);
-
- }
- }
-
- }
-
- }
这里面用了一些策略:
- 先分析标题,如果标题中出现相关词汇,仅需判断正文倾向性即可。
- 如果标题中出现相关词汇,并且标题存在倾向,以标题倾向为准。
- 如果上述都不成立,则合并标题与正文,一起进行分词与情感词汇识别。
- 对于通篇识别为负面情感的文章需要进一步判断相关性。
- 采用距离矩阵的方式判断相关性。
- 需要设定正向最大距离阈值与反向最大距离阈值。
本文转自william_xu 51CTO博客,原文链接:http://blog.51cto.com/williamx/863110,如需转载请自行联系原作者