From e8a2fe8b1cd5b2bb6d0d6e6a6ca9253ee38ffc61 Mon Sep 17 00:00:00 2001 From: ikawaha Date: Sat, 19 Mar 2016 15:18:51 +0900 Subject: [PATCH 1/4] Add the user dictionary builder --- internal/dic/udic.go | 8 +-- tokenizer/dic.go | 11 --- tokenizer/udic.go | 123 +++++++++++++++++++++++++++++++++ tokenizer/udic_test.go | 153 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 280 insertions(+), 15 deletions(-) create mode 100644 tokenizer/udic.go create mode 100644 tokenizer/udic_test.go diff --git a/internal/dic/udic.go b/internal/dic/udic.go index 7ef2c70c..55d1cd77 100644 --- a/internal/dic/udic.go +++ b/internal/dic/udic.go @@ -35,11 +35,11 @@ type UserDic struct { Contents []UserDicContent } +// UserDicColumnSize is the column size of the user dictionary. +const UserDicColumnSize = 4 + // NewUserDic build a user dictionary from a file. func NewUserDic(path string) (udic *UserDic, err error) { - const ( - userDicColumnSize = 4 - ) udic = new(UserDic) f, err := os.Open(path) if err != nil { @@ -66,7 +66,7 @@ func NewUserDic(path string) (udic *UserDic, err error) { var keys []string for _, line := range text { record := strings.Split(line, ",") - if len(record) != userDicColumnSize { + if len(record) != UserDicColumnSize { err = fmt.Errorf("invalid format: %s", line) return } diff --git a/tokenizer/dic.go b/tokenizer/dic.go index 58dea6ba..b2588112 100644 --- a/tokenizer/dic.go +++ b/tokenizer/dic.go @@ -21,11 +21,6 @@ type Dic struct { dic *dic.Dic } -// UserDic represents a user dictionary. -type UserDic struct { - dic *dic.UserDic -} - // SysDic returns the system dictionary (IPA dictionary). func SysDic() Dic { return Dic{dic.SysDic()} @@ -46,9 +41,3 @@ func NewDic(path string) (Dic, error) { d, err := dic.Load(path) return Dic{d}, err } - -// NewUserDic build a user dictionary from a file. 
-func NewUserDic(path string) (UserDic, error) { - d, err := dic.NewUserDic(path) - return UserDic{d}, err -} diff --git a/tokenizer/udic.go b/tokenizer/udic.go new file mode 100644 index 00000000..95256e42 --- /dev/null +++ b/tokenizer/udic.go @@ -0,0 +1,123 @@ +// Copyright 2016 ikawaha +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tokenizer + +import ( + "bufio" + "fmt" + "io" + "os" + "sort" + "strings" + + "github.com/ikawaha/kagome/internal/dic" +) + +// UserDic represents a user dictionary. +type UserDic struct { + dic *dic.UserDic +} + +// NewUserDic builds a user dictionary from a file. 
+func NewUserDic(path string) (UserDic, error) { + f, err := os.Open(path) + if err != nil { + return UserDic{}, err + } + defer f.Close() + + r, err := NewUserDicRecords(f) + if err != nil { + return UserDic{}, err + } + return r.NewUserDic() +} + +type UserDicRecord struct { + Text string `json:"text"` + Tokens []string `json:"tokens"` + Yomi []string `json:"yomi"` + Pos string `json:"pos"` +} + +type UserDicRecords []UserDicRecord + +func (u UserDicRecords) Len() int { return len(u) } +func (u UserDicRecords) Swap(i, j int) { u[i], u[j] = u[j], u[i] } +func (u UserDicRecords) Less(i, j int) bool { return u[i].Text < u[j].Text } + +func NewUserDicRecords(r io.Reader) (UserDicRecords, error) { + var text []string + scanner := bufio.NewScanner(r) + for scanner.Scan() { + line := scanner.Text() + if line == "" || strings.HasPrefix(line, "#") { + continue + } + text = append(text, line) + } + if err := scanner.Err(); err != nil { + return nil, err + } + + var records UserDicRecords + for _, line := range text { + vec := strings.Split(line, ",") + if len(vec) != dic.UserDicColumnSize { + return nil, fmt.Errorf("invalid format: %s", line) + } + tokens := strings.Split(vec[1], " ") + yomi := strings.Split(vec[2], " ") + if len(tokens) == 0 || len(tokens) != len(yomi) { + return nil, fmt.Errorf("invalid format: %s", line) + } + r := UserDicRecord{ + Text: vec[0], + Tokens: tokens, + Yomi: yomi, + Pos: vec[3], + } + records = append(records, r) + } + return records, nil +} + +func (u UserDicRecords) NewUserDic() (UserDic, error) { + udic := new(dic.UserDic) + sort.Sort(u) + + prev := "" + keys := make([]string, 0, len(u)) + for _, r := range u { + k := strings.TrimSpace(r.Text) + if prev == k { + continue + } + prev = k + keys = append(keys, k) + if len(r.Tokens) == 0 || len(r.Tokens) != len(r.Yomi) { + return UserDic{}, fmt.Errorf("invalid format: %s", r) + } + c := dic.UserDicContent{ + Tokens: r.Tokens, + Yomi: r.Yomi, + Pos: r.Pos, + } + udic.Contents = 
append(udic.Contents, c) + } + idx, err := dic.BuildIndexTable(keys) + udic.Index = idx + return UserDic{dic: udic}, err +} diff --git a/tokenizer/udic_test.go b/tokenizer/udic_test.go new file mode 100644 index 00000000..7d55543e --- /dev/null +++ b/tokenizer/udic_test.go @@ -0,0 +1,153 @@ +// Copyright 2015 ikawaha +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tokenizer + +import ( + "encoding/json" + "reflect" + "strings" + "testing" +) + +var testFile = "../_sample/userdic.txt" + +func TestNewUserDic01(t *testing.T) { + if _, e := NewUserDic(""); e == nil { + t.Error("expected error, but no occured\n") + } +} + +func TestNewUserDicIndex01(t *testing.T) { + udic, e := NewUserDic(testFile) + if e != nil { + t.Fatalf("unexpected error: %v\n", e) + } + type tuple struct { + inp string + id int + ok bool + } + callAndRespose := []tuple{ + tuple{inp: "日本経済新聞", id: 0, ok: true}, + tuple{inp: "朝青龍", id: 1, ok: true}, + tuple{inp: "関西国際空港", id: 2, ok: true}, + tuple{inp: "成田国際空港", id: 9, ok: false}, + } + for _, cr := range callAndRespose { + if ids := udic.dic.Index.Search(cr.inp); (len(ids) != 0) != cr.ok { + t.Errorf("got %v, expected %v\n", ids, cr.ok) + } + } +} + +func TestNewUserDicRecords01(t *testing.T) { + r := UserDicRecords{ + { + Text: "日本経済新聞", + Tokens: []string{"日本", "経済", "新聞"}, + Yomi: []string{"ニホン", "ケイザイ", "シンブン"}, + Pos: "カスタム名詞", + }, + { + Text: "朝青龍", + Tokens: []string{"朝青龍"}, + Yomi: []string{"アサショウリュウ"}, + Pos: "カスタム人名", 
+ }, + } + udic, err := r.NewUserDic() + if err != nil { + t.Fatalf("user dic build error, %v", err) + } + if ids := udic.dic.Index.Search("日本経済新聞"); len(ids) != 1 { + t.Errorf("user dic search failed") + } else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}) { + t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}) + } + if ids := udic.dic.Index.Search("関西国際空港"); len(ids) != 0 { + t.Errorf("user dic build failed") + } + if ids := udic.dic.Index.Search("朝青龍"); len(ids) == 0 { + t.Errorf("user dic search failed") + } else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"}) { + t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"}) + } +} + +func TestNewUserDicRecords02(t *testing.T) { + s := ` +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +# 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム地名 +朝青龍,朝青龍,アサショウリュウ,カスタム人名 +` + r := strings.NewReader(s) + rec, err := NewUserDicRecords(r) + if err != nil { + t.Fatalf("user dic build error, %v", err) + } + udic, err := rec.NewUserDic() + if err != nil { + t.Fatalf("user dic build error, %v", err) + } + if ids := udic.dic.Index.Search("日本経済新聞"); len(ids) != 1 { + t.Errorf("user dic search failed") + } else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}) { + t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}) + } + if ids := udic.dic.Index.Search("関西国際空港"); len(ids) != 0 { + t.Errorf("user dic build failed") + } + if ids := udic.dic.Index.Search("朝青龍"); len(ids) == 0 { + t.Errorf("user dic search failed") + } else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"}) { + t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"}) + } + +} + +func TestUserDicRecordsLoadFromJSON(t *testing.T) { + var rec UserDicRecords + json.Unmarshal([]byte(`[ + { + "text":"日本経済新聞", + "tokens":["日本","経済","新聞"], + 
"yomi":["ニホン","ケイザイ","シンブン"], + "pos":"カスタム名詞" + }, + { + "text":"朝青龍", + "tokens":["朝青龍"], + "yomi":["アサショウリュウ"], + "pos":"カスタム人名" + }]`), &rec) + expected := UserDicRecords{ + { + Text: "日本経済新聞", + Tokens: []string{"日本", "経済", "新聞"}, + Yomi: []string{"ニホン", "ケイザイ", "シンブン"}, + Pos: "カスタム名詞", + }, + { + Text: "朝青龍", + Tokens: []string{"朝青龍"}, + Yomi: []string{"アサショウリュウ"}, + Pos: "カスタム人名", + }, + } + + if !reflect.DeepEqual(rec, expected) { + t.Errorf("got %v, expected %v", rec, expected) + } +} From e5a288e323413e9e22177b2f3acfb416823bae58 Mon Sep 17 00:00:00 2001 From: ikawaha Date: Sat, 19 Mar 2016 15:34:51 +0900 Subject: [PATCH 2/4] Fix golint --- tokenizer/udic.go | 4 ++++ tokenizer/udic_test.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tokenizer/udic.go b/tokenizer/udic.go index 95256e42..503a9c34 100644 --- a/tokenizer/udic.go +++ b/tokenizer/udic.go @@ -45,6 +45,7 @@ func NewUserDic(path string) (UserDic, error) { return r.NewUserDic() } +// UserDicRecord represents a record of the user dictionary file format. type UserDicRecord struct { Text string `json:"text"` Tokens []string `json:"tokens"` @@ -52,12 +53,14 @@ type UserDicRecord struct { Pos string `json:"pos"` } +// UserDicRecords represents user dictionary data. type UserDicRecords []UserDicRecord func (u UserDicRecords) Len() int { return len(u) } func (u UserDicRecords) Swap(i, j int) { u[i], u[j] = u[j], u[i] } func (u UserDicRecords) Less(i, j int) bool { return u[i].Text < u[j].Text } +// NewUserDicRecords loads user dictionary data from io.Reader. func NewUserDicRecords(r io.Reader) (UserDicRecords, error) { var text []string scanner := bufio.NewScanner(r) @@ -94,6 +97,7 @@ func NewUserDicRecords(r io.Reader) (UserDicRecords, error) { return records, nil } +// NewUserDic builds a user dictionary. 
func (u UserDicRecords) NewUserDic() (UserDic, error) { udic := new(dic.UserDic) sort.Sort(u) diff --git a/tokenizer/udic_test.go b/tokenizer/udic_test.go index 7d55543e..725ad233 100644 --- a/tokenizer/udic_test.go +++ b/tokenizer/udic_test.go @@ -119,7 +119,7 @@ func TestNewUserDicRecords02(t *testing.T) { func TestUserDicRecordsLoadFromJSON(t *testing.T) { var rec UserDicRecords - json.Unmarshal([]byte(`[ + _ = json.Unmarshal([]byte(`[ { "text":"日本経済新聞", "tokens":["日本","経済","新聞"], From f9542e929805796ba7c335b289a23892b13494ee Mon Sep 17 00:00:00 2001 From: ikawaha Date: Sat, 19 Mar 2016 18:05:40 +0900 Subject: [PATCH 3/4] Fix error checks --- tokenizer/udic.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tokenizer/udic.go b/tokenizer/udic.go index 503a9c34..fe99630d 100644 --- a/tokenizer/udic.go +++ b/tokenizer/udic.go @@ -107,12 +107,12 @@ func (u UserDicRecords) NewUserDic() (UserDic, error) { for _, r := range u { k := strings.TrimSpace(r.Text) if prev == k { - continue + return UserDic{}, fmt.Errorf("duplicated error, %+v", r) } prev = k keys = append(keys, k) if len(r.Tokens) == 0 || len(r.Tokens) != len(r.Yomi) { - return UserDic{}, fmt.Errorf("invalid format: %s", r) + return UserDic{}, fmt.Errorf("invalid format, %+v", r) } c := dic.UserDicContent{ Tokens: r.Tokens, From 5e80a7c17b6b0bb5331e47122cf5ea9c8375cca8 Mon Sep 17 00:00:00 2001 From: ikawaha Date: Sat, 19 Mar 2016 18:06:03 +0900 Subject: [PATCH 4/4] Add tests --- tokenizer/udic_test.go | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tokenizer/udic_test.go b/tokenizer/udic_test.go index 725ad233..483fc84c 100644 --- a/tokenizer/udic_test.go +++ b/tokenizer/udic_test.go @@ -117,6 +117,42 @@ func TestNewUserDicRecords02(t *testing.T) { } +func TestNewUserDicRecords03(t *testing.T) { + r := UserDicRecords{ + { + Text: "日本経済新聞", + Tokens: []string{"日本", "経済", "新聞"}, + Yomi: []string{"ニホン", "ケイザイ"}, + Pos: "カスタム名詞", + }, + } + _, err := 
r.NewUserDic() + if err == nil { + t.Errorf("expected error, but nil") + } +} + +func TestNewUserDicRecords04(t *testing.T) { + r := UserDicRecords{ + { + Text: "日本経済新聞", + Tokens: []string{"日本", "経済", "新聞"}, + Yomi: []string{"ニホン", "ケイザイ", "シンブン"}, + Pos: "カスタム名詞", + }, + { + Text: "日本経済新聞", + Tokens: []string{"日本", "経済", "新聞"}, + Yomi: []string{"ニホン", "ケイザイ", "シンブン"}, + Pos: "カスタム名詞", + }, + } + _, err := r.NewUserDic() + if err == nil { + t.Errorf("expected error, but nil") + } +} + func TestUserDicRecordsLoadFromJSON(t *testing.T) { var rec UserDicRecords _ = json.Unmarshal([]byte(`[