Skip to content

Commit

Permalink
做了一些nlp分词方式的改动
Browse files Browse the repository at this point in the history
  • Loading branch information
ansjsun committed Jan 22, 2014
1 parent d6748ee commit ab5a16f
Show file tree
Hide file tree
Showing 26 changed files with 3,990 additions and 45,604 deletions.
3 changes: 1 addition & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<artifactId>ansj_seg</artifactId>
<packaging>jar</packaging>
<name>ansj_seg</name>
<version>1.1.alpha</version>
<version>1.1</version>
<description>best java chinese word seg ! </description>
<url>https://github.com/ansjsun/ansj_seg</url>
<licenses>
Expand Down Expand Up @@ -64,7 +64,6 @@
</configuration>
</plugin>


<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.7</version>
Expand Down
276 changes: 130 additions & 146 deletions src/main/java/org/ansj/dic/LearnTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import org.ansj.domain.NewWord;
import org.ansj.domain.TermNatures;
import org.ansj.recognition.AsianPersonRecognition;
import org.ansj.recognition.CompanyRecogntion;
import org.ansj.recognition.ForeignPersonRecognition;
import org.ansj.util.Graph;

Expand All @@ -22,149 +21,134 @@
*/
public class LearnTool {

/**
 * Switches that turn the individual learners on or off.
 * NOTE(review): isCompany is not consulted by learn(Graph) as shown here - confirm whether
 * it is still needed.
 */
public boolean isCompany = true;

public boolean isAsianName = true;

public boolean isForeignName = true;

/**
 * Tells callers how many words have been learned so far; incremented by addTerm each time a
 * word that was not yet stored is added.
 */
public int count;

/**
 * Result set of new-word discovery. It can be serialized to disk and later used as a
 * training set.
 */
private final SmartForest<NewWord> sf = new SmartForest<NewWord>();

/**
 * Runs every enabled person-name recognizer over the given graph and stores the words
 * they discover.
 *
 * @param graph
 *            the segmentation graph to learn from
 */
public void learn(Graph graph) {
    // Asian person names, only when that learner is switched on.
    if (isAsianName) {
        findAsianPerson(graph);
    }
    // Foreign person names, only when that learner is switched on.
    if (isForeignName) {
        findForeignPerson(graph);
    }
}

/**
 * Collects the new words reported by AsianPersonRecognition for the graph's terms and
 * stores them.
 *
 * @param graph
 *            the graph whose terms are analysed
 */
private void findAsianPerson(Graph graph) {
    addListToTerm(new AsianPersonRecognition(graph.terms).getNewWords());
}

/**
 * Collects the new words reported by ForeignPersonRecognition for the graph's terms and
 * stores them.
 *
 * @param graph
 *            the graph whose terms are analysed
 */
private void findForeignPerson(Graph graph) {
    addListToTerm(new ForeignPersonRecognition(graph.terms).getNewWords());
}

/**
 * Collects the new words reported by CompanyRecogntion for the graph's terms and stores
 * them.
 *
 * @param graph
 *            the graph whose terms are analysed
 */
private void findCompany(Graph graph) {
    addListToTerm(new CompanyRecogntion(graph.terms).getNewWords());
}

/**
 * Adds a batch of new words to the learned-word tree. Each word's score is set to -1
 * before it is handed to addTerm.
 *
 * @param newWords
 *            the words to store; an empty list is a no-op
 */
private void addListToTerm(List<NewWord> newWords) {
    if (!newWords.isEmpty()) {
        for (NewWord candidate : newWords) {
            candidate.setScore(-1);
            addTerm(candidate);
        }
    }
}


/**
 * Adds a new word to the tree, or merges it into the entry already stored under the same
 * name.
 * <p>
 * If the tree already holds a word for this name, that stored word is updated with the
 * score, nature and frequency of {@code newWord}. Otherwise {@code count} is incremented
 * and {@code newWord} is inserted.
 * <p>
 * The lookup and the insert run under a single lock on the tree. Locking only the insert
 * would let two threads both miss the same name, both increment {@code count} and both
 * add the word.
 *
 * @param newWord
 *            the word to store or merge
 */
public void addTerm(NewWord newWord) {
    synchronized (sf) {
        SmartForest<NewWord> branch = sf.getBranch(newWord.getName());
        // A branch can exist without carrying a word, so both must be checked.
        NewWord existing = (branch == null) ? null : branch.getParam();
        if (existing != null) {
            existing.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq());
        } else {
            count++;
            sf.add(newWord.getName(), newWord);
        }
    }
}

/**
 * Exposes the tree that holds every learned word.
 *
 * @return the internal tree itself, not a copy
 */
public SmartForest<NewWord> getForest() {
    return sf;
}

/**
 * Returns the learned new words regardless of their nature.
 *
 * @param num
 *            how many entries to return; 0 returns all of them
 * @return the result of {@code getTopTree(num, null)}
 */
public List<Entry<String, Double>> getTopTree(int num) {
    // A null nature means "do not filter by nature".
    return getTopTree(num, null);
}

/**
 * Returns the learned new words, optionally restricted to one nature, in the order
 * produced by {@code CollectionUtil.sortMapByValue(map, -1)} (presumably descending by
 * score - confirm against CollectionUtil).
 * <p>
 * NOTE(review): the walk starts at the children of each first-level branch, so a word
 * stored directly on a first-level node would not be reported - confirm this is intended.
 *
 * @param num
 *            maximum number of entries to return; 0 or a negative value returns all
 * @param nature
 *            the nature a word must have to be included, or {@code null} for any nature
 * @return the selected (word name, score) entries; an empty list, never {@code null},
 *         when nothing has been learned yet
 */
public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) {
    if (sf.branches == null) {
        // Nothing learned yet: return an empty list so callers can iterate without a null check.
        return new java.util.ArrayList<Entry<String, Double>>();
    }
    HashMap<String, Double> hm = new HashMap<String, Double>();
    for (int i = 0; i < sf.branches.length; i++) {
        valueResult(sf.branches[i], hm, nature);
    }
    List<Entry<String, Double>> sorted = CollectionUtil.sortMapByValue(hm, -1);
    if (num <= 0) {
        // A negative count used to reach subList and throw; treat it like 0, meaning "all".
        return sorted;
    }
    return sorted.subList(0, Math.min(num, sorted.size()));
}

/**
 * Recursively walks the children of a tree node and copies matching words into the map.
 * <p>
 * A child with status 3 is recorded and not descended into. A child with status 2 is
 * recorded and then descended into. Any other child is only descended into. (Presumably
 * 3 marks a complete word with no continuation and 2 a word that is also a prefix -
 * confirm against SmartForest.)
 *
 * @param smartForest
 *            the node whose children are inspected; a null node or a node without
 *            children is ignored
 * @param hm
 *            receives word name to score for every recorded word
 * @param nature
 *            the nature a word must have to be recorded, or {@code null} for any nature
 */
private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, TermNatures nature) {
    if (smartForest == null || smartForest.branches == null) {
        return;
    }
    for (SmartForest<NewWord> child : smartForest.branches) {
        int status = child.getStatus();
        if (status == 2 || status == 3) {
            NewWord word = child.getParam();
            if (nature == null || word.getNature().equals(nature)) {
                hm.put(word.getName(), word.getScore());
            }
        }
        if (status != 3) {
            valueResult(child, hm, nature);
        }
    }
}
/**
 * Switches that turn the individual learners on or off.
 */
public boolean isAsianName = true;

public boolean isForeignName = true;

/**
 * Tells callers how many words have been learned so far; incremented by addTerm each time a
 * word that was not yet stored is added.
 */
public int count;

/**
 * Result set of new-word discovery. It can be serialized to disk and later used as a
 * training set.
 */
private final SmartForest<NewWord> sf = new SmartForest<NewWord>();

/**
 * Runs every enabled person-name recognizer over the given graph and stores the words
 * they discover.
 *
 * @param graph
 *            the segmentation graph to learn from
 */
public void learn(Graph graph) {
    // Asian person names, only when that learner is switched on.
    if (isAsianName) {
        findAsianPerson(graph);
    }
    // Foreign person names, only when that learner is switched on.
    if (isForeignName) {
        findForeignPerson(graph);
    }
}

/**
 * Collects the new words reported by AsianPersonRecognition for the graph's terms and
 * stores them.
 *
 * @param graph
 *            the graph whose terms are analysed
 */
private void findAsianPerson(Graph graph) {
    addListToTerm(new AsianPersonRecognition(graph.terms).getNewWords());
}

/**
 * Collects the new words reported by ForeignPersonRecognition for the graph's terms and
 * stores them.
 *
 * @param graph
 *            the graph whose terms are analysed
 */
private void findForeignPerson(Graph graph) {
    addListToTerm(new ForeignPersonRecognition(graph.terms).getNewWords());
}

/**
 * Adds a batch of new words to the learned-word tree. Each word's score is set to -1
 * before it is handed to addTerm.
 *
 * @param newWords
 *            the words to store; an empty list is a no-op
 */
private void addListToTerm(List<NewWord> newWords) {
    if (!newWords.isEmpty()) {
        for (NewWord candidate : newWords) {
            candidate.setScore(-1);
            addTerm(candidate);
        }
    }
}

/**
 * Adds a new word to the tree, or merges it into the entry already stored under the same
 * name.
 * <p>
 * If the tree already holds a word for this name, that stored word is updated with the
 * score, nature and frequency of {@code newWord}. Otherwise {@code count} is incremented
 * and {@code newWord} is inserted.
 * <p>
 * The lookup and the insert run under a single lock on the tree. Locking only the insert
 * would let two threads both miss the same name, both increment {@code count} and both
 * add the word.
 *
 * @param newWord
 *            the word to store or merge
 */
public void addTerm(NewWord newWord) {
    synchronized (sf) {
        SmartForest<NewWord> branch = sf.getBranch(newWord.getName());
        // A branch can exist without carrying a word, so both must be checked.
        NewWord existing = (branch == null) ? null : branch.getParam();
        if (existing != null) {
            existing.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq());
        } else {
            count++;
            sf.add(newWord.getName(), newWord);
        }
    }
}

/**
 * Exposes the tree that holds every learned word.
 *
 * @return the internal tree itself, not a copy
 */
public SmartForest<NewWord> getForest() {
    return sf;
}

/**
 * Returns the learned new words regardless of their nature.
 *
 * @param num
 *            how many entries to return; 0 returns all of them
 * @return the result of {@code getTopTree(num, null)}
 */
public List<Entry<String, Double>> getTopTree(int num) {
    // A null nature means "do not filter by nature".
    return getTopTree(num, null);
}

/**
 * Returns the learned new words, optionally restricted to one nature, in the order
 * produced by {@code CollectionUtil.sortMapByValue(map, -1)} (presumably descending by
 * score - confirm against CollectionUtil).
 * <p>
 * NOTE(review): the walk starts at the children of each first-level branch, so a word
 * stored directly on a first-level node would not be reported - confirm this is intended.
 *
 * @param num
 *            maximum number of entries to return; 0 or a negative value returns all
 * @param nature
 *            the nature a word must have to be included, or {@code null} for any nature
 * @return the selected (word name, score) entries; an empty list, never {@code null},
 *         when nothing has been learned yet
 */
public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) {
    if (sf.branches == null) {
        // Nothing learned yet: return an empty list so callers can iterate without a null check.
        return new java.util.ArrayList<Entry<String, Double>>();
    }
    HashMap<String, Double> hm = new HashMap<String, Double>();
    for (int i = 0; i < sf.branches.length; i++) {
        valueResult(sf.branches[i], hm, nature);
    }
    List<Entry<String, Double>> sorted = CollectionUtil.sortMapByValue(hm, -1);
    if (num <= 0) {
        // A negative count used to reach subList and throw; treat it like 0, meaning "all".
        return sorted;
    }
    return sorted.subList(0, Math.min(num, sorted.size()));
}

/**
 * Recursively walks the children of a tree node and copies matching words into the map.
 * <p>
 * A child with status 3 is recorded and not descended into. A child with status 2 is
 * recorded and then descended into. Any other child is only descended into. (Presumably
 * 3 marks a complete word with no continuation and 2 a word that is also a prefix -
 * confirm against SmartForest.)
 *
 * @param smartForest
 *            the node whose children are inspected; a null node or a node without
 *            children is ignored
 * @param hm
 *            receives word name to score for every recorded word
 * @param nature
 *            the nature a word must have to be recorded, or {@code null} for any nature
 */
private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, TermNatures nature) {
    if (smartForest == null || smartForest.branches == null) {
        return;
    }
    for (SmartForest<NewWord> child : smartForest.branches) {
        int status = child.getStatus();
        if (status == 2 || status == 3) {
            NewWord word = child.getParam();
            if (nature == null || word.getNature().equals(nature)) {
                hm.put(word.getName(), word.getScore());
            }
        }
        if (status != 3) {
            valueResult(child, hm, nature);
        }
    }
}
}
68 changes: 0 additions & 68 deletions src/main/java/org/ansj/domain/CompanyNatureAttr.java

This file was deleted.

Loading

0 comments on commit ab5a16f

Please sign in to comment.