Skip to content

Commit

Permalink
Merge pull request #74 from ikawaha/feature/udic_20160318
Browse files Browse the repository at this point in the history
Add the user dictionary builder
  • Loading branch information
ikawaha committed Mar 20, 2016
2 parents d73e145 + 5e80a7c commit 053bff1
Show file tree
Hide file tree
Showing 4 changed files with 320 additions and 15 deletions.
8 changes: 4 additions & 4 deletions internal/dic/udic.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ type UserDic struct {
Contents []UserDicContent
}

// UserDicColumnSize is the column size of the user dictionary.
const UserDicColumnSize = 4

// NewUserDic builds a user dictionary from a file.
func NewUserDic(path string) (udic *UserDic, err error) {
const (
userDicColumnSize = 4
)
udic = new(UserDic)
f, err := os.Open(path)
if err != nil {
Expand All @@ -66,7 +66,7 @@ func NewUserDic(path string) (udic *UserDic, err error) {
var keys []string
for _, line := range text {
record := strings.Split(line, ",")
if len(record) != userDicColumnSize {
if len(record) != UserDicColumnSize {
err = fmt.Errorf("invalid format: %s", line)
return
}
Expand Down
11 changes: 0 additions & 11 deletions tokenizer/dic.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,6 @@ type Dic struct {
dic *dic.Dic
}

// UserDic represents a user dictionary.
type UserDic struct {
dic *dic.UserDic
}

// SysDic returns the system dictionary (IPA dictionary).
func SysDic() Dic {
return Dic{dic.SysDic()}
Expand All @@ -46,9 +41,3 @@ func NewDic(path string) (Dic, error) {
d, err := dic.Load(path)
return Dic{d}, err
}

// NewUserDic builds a user dictionary from a file.
func NewUserDic(path string) (UserDic, error) {
d, err := dic.NewUserDic(path)
return UserDic{d}, err
}
127 changes: 127 additions & 0 deletions tokenizer/udic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright 2016 ikawaha
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tokenizer

import (
"bufio"
"fmt"
"io"
"os"
"sort"
"strings"

"github.com/ikawaha/kagome/internal/dic"
)

// UserDic represents a user dictionary.
// It holds a pointer to the internal dic.UserDic in an unexported field.
type UserDic struct {
dic *dic.UserDic
}

// NewUserDic builds a user dictionary from the file at path.
//
// The file is opened, parsed with NewUserDicRecords and the resulting records
// are turned into a dictionary with UserDicRecords.NewUserDic. The first error
// met along the way is returned together with a zero UserDic.
func NewUserDic(path string) (UserDic, error) {
	file, err := os.Open(path)
	if err != nil {
		return UserDic{}, err
	}
	defer file.Close()

	records, err := NewUserDicRecords(file)
	if err != nil {
		return UserDic{}, err
	}
	return records.NewUserDic()
}

// UserDicRecord is a single entry of the user dictionary file format.
type UserDicRecord struct {
	// Text is the entry's surface form; records are ordered by it.
	Text string `json:"text"`
	// Tokens lists the pieces the entry is split into.
	Tokens []string `json:"tokens"`
	// Yomi lists the readings; there is one reading per element of Tokens.
	Yomi []string `json:"yomi"`
	// Pos is the part-of-speech label of the entry.
	Pos string `json:"pos"`
}

// UserDicRecords is a list of user dictionary entries.
// It implements sort.Interface, ordering entries by their Text.
type UserDicRecords []UserDicRecord

// Len returns the number of records.
func (u UserDicRecords) Len() int {
	return len(u)
}

// Swap exchanges the records at positions i and j.
func (u UserDicRecords) Swap(i, j int) {
	u[j], u[i] = u[i], u[j]
}

// Less reports whether the Text of record i sorts before the Text of record j.
func (u UserDicRecords) Less(i, j int) bool {
	return u[j].Text > u[i].Text
}

// NewUserDicRecords reads user dictionary records from r.
//
// Input is processed line by line. Empty lines and lines starting with "#"
// are ignored. Every remaining line must consist of dic.UserDicColumnSize
// comma-separated columns: text, tokens, yomi and pos. The tokens and yomi
// columns are split on single spaces and must yield the same number of
// elements.
//
// A read error from r is reported before any line is parsed; a malformed line
// yields an "invalid format" error quoting that line. With no records to
// return the result is nil.
func NewUserDicRecords(r io.Reader) (UserDicRecords, error) {
	// First pass: gather the meaningful lines so that a read failure is
	// reported ahead of any format problem.
	sc := bufio.NewScanner(r)
	var lines []string
	for sc.Scan() {
		if l := sc.Text(); l != "" && !strings.HasPrefix(l, "#") {
			lines = append(lines, l)
		}
	}
	if err := sc.Err(); err != nil {
		return nil, err
	}

	// Second pass: turn each line into a record.
	var recs UserDicRecords
	for _, l := range lines {
		cols := strings.Split(l, ",")
		if len(cols) != dic.UserDicColumnSize {
			return nil, fmt.Errorf("invalid format: %s", l)
		}
		surfaces := strings.Split(cols[1], " ")
		readings := strings.Split(cols[2], " ")
		if n := len(surfaces); n == 0 || n != len(readings) {
			return nil, fmt.Errorf("invalid format: %s", l)
		}
		recs = append(recs, UserDicRecord{
			Text:   cols[0],
			Tokens: surfaces,
			Yomi:   readings,
			Pos:    cols[3],
		})
	}
	return recs, nil
}

// NewUserDic builds a user dictionary from the records.
//
// The key of a record is its Text with surrounding white space removed.
// Records are ordered by that key before the index is built, so the keys
// handed to dic.BuildIndexTable are in ascending order and equal keys are
// always adjacent, which the duplicate check relies on. The receiver itself
// is neither reordered nor modified.
//
// An error is returned, together with a zero UserDic, when a key is empty,
// when two records share a key, when a record has no tokens or a different
// number of tokens and yomi, or when the index cannot be built.
func (u UserDicRecords) NewUserDic() (UserDic, error) {
	// Work on a copy whose Text already holds the trimmed key. Sorting on the
	// raw Text (as Less would do on u) can leave trimmed keys out of order,
	// e.g. " a" < "B" < "a", and then hide duplicates.
	recs := make(UserDicRecords, len(u))
	copy(recs, u)
	for i := range recs {
		recs[i].Text = strings.TrimSpace(recs[i].Text)
	}
	sort.Sort(recs)

	udic := new(dic.UserDic)
	keys := make([]string, 0, len(recs))
	for i, r := range recs {
		if r.Text == "" {
			return UserDic{}, fmt.Errorf("invalid format, %+v", r)
		}
		if i > 0 && recs[i-1].Text == r.Text {
			return UserDic{}, fmt.Errorf("duplicated error, %+v", r)
		}
		if len(r.Tokens) == 0 || len(r.Tokens) != len(r.Yomi) {
			return UserDic{}, fmt.Errorf("invalid format, %+v", r)
		}
		keys = append(keys, r.Text)
		udic.Contents = append(udic.Contents, dic.UserDicContent{
			Tokens: r.Tokens,
			Yomi:   r.Yomi,
			Pos:    r.Pos,
		})
	}

	idx, err := dic.BuildIndexTable(keys)
	if err != nil {
		return UserDic{}, err
	}
	udic.Index = idx
	return UserDic{dic: udic}, nil
}
189 changes: 189 additions & 0 deletions tokenizer/udic_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Copyright 2015 ikawaha
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tokenizer

import (
"encoding/json"
"reflect"
"strings"
"testing"
)

// testFile is the path of the sample user dictionary loaded by TestNewUserDicIndex01.
var testFile = "../_sample/userdic.txt"

// TestNewUserDic01 checks that NewUserDic reports an error for an empty path.
func TestNewUserDic01(t *testing.T) {
	if _, err := NewUserDic(""); err == nil {
		t.Error("expected error, but none occurred")
	}
}

// TestNewUserDicIndex01 loads the sample user dictionary and checks, for a
// set of inputs, whether the index finds an entry for them.
func TestNewUserDicIndex01(t *testing.T) {
	udic, err := NewUserDic(testFile)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	type tuple struct {
		inp string
		id  int // recorded for reference only; this test does not assert it
		ok  bool
	}
	callAndResponse := []tuple{
		{inp: "日本経済新聞", id: 0, ok: true},
		{inp: "朝青龍", id: 1, ok: true},
		{inp: "関西国際空港", id: 2, ok: true},
		{inp: "成田国際空港", id: 9, ok: false},
	}
	for _, cr := range callAndResponse {
		ids := udic.dic.Index.Search(cr.inp)
		// Name the input in the failure so the offending case is identifiable.
		if found := len(ids) != 0; found != cr.ok {
			t.Errorf("input %q: got ids %v (found=%v), expected found=%v", cr.inp, ids, found, cr.ok)
		}
	}
}

// TestNewUserDicRecords01 builds a dictionary from in-memory records and
// checks that registered entries are found with their tokens while an
// unregistered entry is not found.
func TestNewUserDicRecords01(t *testing.T) {
	records := UserDicRecords{
		{
			Text:   "日本経済新聞",
			Tokens: []string{"日本", "経済", "新聞"},
			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
			Pos:    "カスタム名詞",
		},
		{
			Text:   "朝青龍",
			Tokens: []string{"朝青龍"},
			Yomi:   []string{"アサショウリュウ"},
			Pos:    "カスタム人名",
		},
	}
	built, err := records.NewUserDic()
	if err != nil {
		t.Fatalf("user dic build error, %v", err)
	}
	d := built.dic

	// Registered entry: exactly one hit carrying the expected tokens.
	switch ids := d.Index.Search("日本経済新聞"); {
	case len(ids) != 1:
		t.Errorf("user dic search failed")
	case !reflect.DeepEqual(d.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}):
		t.Errorf("got %+v, expected %+v", d.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"})
	}

	// Unregistered entry: no hit at all.
	if ids := d.Index.Search("関西国際空港"); len(ids) != 0 {
		t.Errorf("user dic build failed")
	}

	// Registered entry: at least one hit, the first carrying the expected tokens.
	switch ids := d.Index.Search("朝青龍"); {
	case len(ids) == 0:
		t.Errorf("user dic search failed")
	case !reflect.DeepEqual(d.Contents[ids[0]].Tokens, []string{"朝青龍"}):
		t.Errorf("got %+v, expected %+v", d.Contents[ids[0]].Tokens, []string{"朝青龍"})
	}
}

// TestNewUserDicRecords02 parses records from text, where one entry is
// commented out with "#", builds a dictionary from them and checks that only
// the active entries can be found.
func TestNewUserDicRecords02(t *testing.T) {
	const src = `
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
# 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム地名
朝青龍,朝青龍,アサショウリュウ,カスタム人名
`
	records, err := NewUserDicRecords(strings.NewReader(src))
	if err != nil {
		t.Fatalf("user dic build error, %v", err)
	}
	built, err := records.NewUserDic()
	if err != nil {
		t.Fatalf("user dic build error, %v", err)
	}
	d := built.dic

	// Active entry: exactly one hit carrying the expected tokens.
	switch ids := d.Index.Search("日本経済新聞"); {
	case len(ids) != 1:
		t.Errorf("user dic search failed")
	case !reflect.DeepEqual(d.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}):
		t.Errorf("got %+v, expected %+v", d.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"})
	}

	// Commented-out entry: must not be registered.
	if ids := d.Index.Search("関西国際空港"); len(ids) != 0 {
		t.Errorf("user dic build failed")
	}

	// Active entry: at least one hit, the first carrying the expected tokens.
	switch ids := d.Index.Search("朝青龍"); {
	case len(ids) == 0:
		t.Errorf("user dic search failed")
	case !reflect.DeepEqual(d.Contents[ids[0]].Tokens, []string{"朝青龍"}):
		t.Errorf("got %+v, expected %+v", d.Contents[ids[0]].Tokens, []string{"朝青龍"})
	}
}

// TestNewUserDicRecords03 checks that building fails when a record has a
// different number of tokens and yomi.
func TestNewUserDicRecords03(t *testing.T) {
	mismatched := UserDicRecords{
		{
			Text:   "日本経済新聞",
			Tokens: []string{"日本", "経済", "新聞"},
			Yomi:   []string{"ニホン", "ケイザイ"},
			Pos:    "カスタム名詞",
		},
	}
	if _, err := mismatched.NewUserDic(); err == nil {
		t.Errorf("expected error, but nil")
	}
}

func TestNewUserDicRecords04(t *testing.T) {
r := UserDicRecords{
{
Text: "日本経済新聞",
Tokens: []string{"日本", "経済", "新聞"},
Yomi: []string{"ニホン", "ケイザイ", "シンブン"},
Pos: "カスタム名詞",
},
{
Text: "日本経済新聞",
Tokens: []string{"日本", "経済", "新聞"},
Yomi: []string{"ニホン", "ケイザイ", "シンブン"},
Pos: "カスタム名詞",
},
}
_, err := r.NewUserDic()
if err == nil {
t.Errorf("expected error, but nil")
}
}

// TestUserDicRecordsLoadFromJSON checks that UserDicRecords can be decoded
// from a JSON array using the field names declared in the struct tags.
func TestUserDicRecordsLoadFromJSON(t *testing.T) {
	const input = `[
{
"text":"日本経済新聞",
"tokens":["日本","経済","新聞"],
"yomi":["ニホン","ケイザイ","シンブン"],
"pos":"カスタム名詞"
},
{
"text":"朝青龍",
"tokens":["朝青龍"],
"yomi":["アサショウリュウ"],
"pos":"カスタム人名"
}]`
	var rec UserDicRecords
	// Fail right away on a decode error instead of reporting it indirectly as
	// a mismatch against the expected records.
	if err := json.Unmarshal([]byte(input), &rec); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	expected := UserDicRecords{
		{
			Text:   "日本経済新聞",
			Tokens: []string{"日本", "経済", "新聞"},
			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
			Pos:    "カスタム名詞",
		},
		{
			Text:   "朝青龍",
			Tokens: []string{"朝青龍"},
			Yomi:   []string{"アサショウリュウ"},
			Pos:    "カスタム人名",
		},
	}

	if !reflect.DeepEqual(rec, expected) {
		t.Errorf("got %v, expected %v", rec, expected)
	}
}

0 comments on commit 053bff1

Please sign in to comment.