diff --git a/internal/dic/udic.go b/internal/dic/udic.go
index 7ef2c70c..55d1cd77 100644
--- a/internal/dic/udic.go
+++ b/internal/dic/udic.go
@@ -35,11 +35,11 @@ type UserDic struct {
 	Contents []UserDicContent
 }
 
+// UserDicColumnSize is the number of comma-separated columns in a user dictionary record.
+const UserDicColumnSize = 4
+
 // NewUserDic build a user dictionary from a file.
 func NewUserDic(path string) (udic *UserDic, err error) {
-	const (
-		userDicColumnSize = 4
-	)
 	udic = new(UserDic)
 	f, err := os.Open(path)
 	if err != nil {
@@ -66,7 +66,7 @@ func NewUserDic(path string) (udic *UserDic, err error) {
 	var keys []string
 	for _, line := range text {
 		record := strings.Split(line, ",")
-		if len(record) != userDicColumnSize {
+		if len(record) != UserDicColumnSize {
 			err = fmt.Errorf("invalid format: %s", line)
 			return
 		}
diff --git a/tokenizer/dic.go b/tokenizer/dic.go
index 58dea6ba..b2588112 100644
--- a/tokenizer/dic.go
+++ b/tokenizer/dic.go
@@ -21,11 +21,6 @@ type Dic struct {
 	dic *dic.Dic
 }
 
-// UserDic represents a user dictionary.
-type UserDic struct {
-	dic *dic.UserDic
-}
-
 // SysDic returns the system dictionary (IPA dictionary).
 func SysDic() Dic {
 	return Dic{dic.SysDic()}
@@ -46,9 +41,3 @@ func NewDic(path string) (Dic, error) {
 	d, err := dic.Load(path)
 	return Dic{d}, err
 }
-
-// NewUserDic build a user dictionary from a file.
-func NewUserDic(path string) (UserDic, error) {
-	d, err := dic.NewUserDic(path)
-	return UserDic{d}, err
-}
diff --git a/tokenizer/udic.go b/tokenizer/udic.go
new file mode 100644
index 00000000..fe99630d
--- /dev/null
+++ b/tokenizer/udic.go
@@ -0,0 +1,153 @@
+// Copyright 2016 ikawaha
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tokenizer
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strings"
+
+	"github.com/ikawaha/kagome/internal/dic"
+)
+
+// UserDic represents a user dictionary.
+type UserDic struct {
+	dic *dic.UserDic
+}
+
+// NewUserDic builds a user dictionary from the file at path.
+//
+// The file is parsed with NewUserDicRecords and the records are compiled with
+// UserDicRecords.NewUserDic; errors from opening, parsing or building are
+// returned as is.
+func NewUserDic(path string) (UserDic, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return UserDic{}, err
+	}
+	defer f.Close()
+
+	r, err := NewUserDicRecords(f)
+	if err != nil {
+		return UserDic{}, err
+	}
+	return r.NewUserDic()
+}
+
+// UserDicRecord represents a record of the user dictionary file format.
+type UserDicRecord struct {
+	Text   string   `json:"text"`
+	Tokens []string `json:"tokens"`
+	Yomi   []string `json:"yomi"`
+	Pos    string   `json:"pos"`
+}
+
+// UserDicRecords represents user dictionary data.
+// It implements sort.Interface, ordering records by Text.
+type UserDicRecords []UserDicRecord
+
+// Len returns the number of records.
+func (u UserDicRecords) Len() int { return len(u) }
+
+// Swap exchanges the records at indexes i and j.
+func (u UserDicRecords) Swap(i, j int) { u[i], u[j] = u[j], u[i] }
+
+// Less reports whether the Text of record i sorts before that of record j.
+func (u UserDicRecords) Less(i, j int) bool { return u[i].Text < u[j].Text }
+
+// NewUserDicRecords loads user dictionary data from io.Reader.
+//
+// Empty lines and lines starting with "#" are skipped. Every other line must
+// have dic.UserDicColumnSize comma-separated columns: text, space-separated
+// tokens, space-separated yomi and part of speech. A wrong column count or a
+// token count that differs from the yomi count is reported as an error.
+func NewUserDicRecords(r io.Reader) (UserDicRecords, error) {
+	var records UserDicRecords
+	scanner := bufio.NewScanner(r)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		vec := strings.Split(line, ",")
+		if len(vec) != dic.UserDicColumnSize {
+			return nil, fmt.Errorf("invalid format: %s", line)
+		}
+		// strings.Split never returns an empty slice for a non-empty
+		// separator, so only the length mismatch has to be checked.
+		tokens := strings.Split(vec[1], " ")
+		yomi := strings.Split(vec[2], " ")
+		if len(tokens) != len(yomi) {
+			return nil, fmt.Errorf("invalid format: %s", line)
+		}
+		records = append(records, UserDicRecord{
+			Text:   vec[0],
+			Tokens: tokens,
+			Yomi:   yomi,
+			Pos:    vec[3],
+		})
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return records, nil
+}
+
+// NewUserDic builds a user dictionary.
+//
+// Keys are the records' Text with surrounding white space trimmed. Sorting is
+// done on a copy ordered by those keys, so the receiver is left untouched. An
+// error is returned for an empty or duplicated key, for a record without
+// tokens or whose Tokens and Yomi differ in length, and for an index build
+// failure.
+func (u UserDicRecords) NewUserDic() (UserDic, error) {
+	recs := make(UserDicRecords, len(u))
+	copy(recs, u)
+	for i := range recs {
+		recs[i].Text = strings.TrimSpace(recs[i].Text)
+	}
+	// Sort after trimming: the duplicate check below relies on equal keys
+	// being adjacent.
+	sort.Sort(recs)
+
+	udic := new(dic.UserDic)
+	keys := make([]string, 0, len(recs))
+	for i, r := range recs {
+		if r.Text == "" {
+			return UserDic{}, fmt.Errorf("invalid format, %+v", r)
+		}
+		if i > 0 && recs[i-1].Text == r.Text {
+			return UserDic{}, fmt.Errorf("duplicated error, %+v", r)
+		}
+		if len(r.Tokens) == 0 || len(r.Tokens) != len(r.Yomi) {
+			return UserDic{}, fmt.Errorf("invalid format, %+v", r)
+		}
+		keys = append(keys, r.Text)
+		udic.Contents = append(udic.Contents, dic.UserDicContent{
+			Tokens: r.Tokens,
+			Yomi:   r.Yomi,
+			Pos:    r.Pos,
+		})
+	}
+	idx, err := dic.BuildIndexTable(keys)
+	if err != nil {
+		return UserDic{}, err
+	}
+	udic.Index = idx
+	return UserDic{dic: udic}, nil
+}
diff --git a/tokenizer/udic_test.go b/tokenizer/udic_test.go
new file mode 100644
index 00000000..483fc84c
--- /dev/null
+++ b/tokenizer/udic_test.go
@@ -0,0 +1,241 @@
+// Copyright 2015 ikawaha
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tokenizer
+
+import (
+	"encoding/json"
+	"reflect"
+	"strings"
+	"testing"
+)
+
+var testFile = "../_sample/userdic.txt"
+
+func TestNewUserDic01(t *testing.T) {
+	if _, e := NewUserDic(""); e == nil {
+		t.Error("expected error, but no occured\n")
+	}
+}
+
+func TestNewUserDicIndex01(t *testing.T) {
+	udic, e := NewUserDic(testFile)
+	if e != nil {
+		t.Fatalf("unexpected error: %v\n", e)
+	}
+	type tuple struct {
+		inp string
+		id  int
+		ok  bool
+	}
+	callAndRespose := []tuple{
+		tuple{inp: "日本経済新聞", id: 0, ok: true},
+		tuple{inp: "朝青龍", id: 1, ok: true},
+		tuple{inp: "関西国際空港", id: 2, ok: true},
+		tuple{inp: "成田国際空港", id: 9, ok: false},
+	}
+	for _, cr := range callAndRespose {
+		if ids := udic.dic.Index.Search(cr.inp); (len(ids) != 0) != cr.ok {
+			t.Errorf("got %v, expected %v\n", ids, cr.ok)
+		}
+	}
+}
+
+func TestNewUserDicRecords01(t *testing.T) {
+	r := UserDicRecords{
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
+			Pos:    "カスタム名詞",
+		},
+		{
+			Text:   "朝青龍",
+			Tokens: []string{"朝青龍"},
+			Yomi:   []string{"アサショウリュウ"},
+			Pos:    "カスタム人名",
+		},
+	}
+	udic, err := r.NewUserDic()
+	if err != nil {
+		t.Fatalf("user dic build error, %v", err)
+	}
+	if ids := udic.dic.Index.Search("日本経済新聞"); len(ids) != 1 {
+		t.Errorf("user dic search failed")
+	} else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}) {
+		t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"})
+	}
+	if ids := udic.dic.Index.Search("関西国際空港"); len(ids) != 0 {
+		t.Errorf("user dic build failed")
+	}
+	if ids := udic.dic.Index.Search("朝青龍"); len(ids) == 0 {
+		t.Errorf("user dic search failed")
+	} else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"}) {
+		t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"})
+	}
+}
+
+func TestNewUserDicRecords02(t *testing.T) {
+	s := `
+日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
+# 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム地名
+朝青龍,朝青龍,アサショウリュウ,カスタム人名
+`
+	r := strings.NewReader(s)
+	rec, err := NewUserDicRecords(r)
+	if err != nil {
+		t.Fatalf("user dic build error, %v", err)
+	}
+	udic, err := rec.NewUserDic()
+	if err != nil {
+		t.Fatalf("user dic build error, %v", err)
+	}
+	if ids := udic.dic.Index.Search("日本経済新聞"); len(ids) != 1 {
+		t.Errorf("user dic search failed")
+	} else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"}) {
+		t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"日本", "経済", "新聞"})
+	}
+	if ids := udic.dic.Index.Search("関西国際空港"); len(ids) != 0 {
+		t.Errorf("user dic build failed")
+	}
+	if ids := udic.dic.Index.Search("朝青龍"); len(ids) == 0 {
+		t.Errorf("user dic search failed")
+	} else if !reflect.DeepEqual(udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"}) {
+		t.Errorf("got %+v, expected %+v", udic.dic.Contents[ids[0]].Tokens, []string{"朝青龍"})
+	}
+
+}
+
+func TestNewUserDicRecords03(t *testing.T) {
+	r := UserDicRecords{
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ"},
+			Pos:    "カスタム名詞",
+		},
+	}
+	_, err := r.NewUserDic()
+	if err == nil {
+		t.Errorf("expected error, but nil")
+	}
+}
+
+func TestNewUserDicRecords04(t *testing.T) {
+	r := UserDicRecords{
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
+			Pos:    "カスタム名詞",
+		},
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
+			Pos:    "カスタム名詞",
+		},
+	}
+	_, err := r.NewUserDic()
+	if err == nil {
+		t.Errorf("expected error, but nil")
+	}
+}
+
+func TestNewUserDicRecords05(t *testing.T) {
+	r := UserDicRecords{
+		{
+			Text:   "朝青龍",
+			Tokens: []string{"朝青龍"},
+			Yomi:   []string{"アサショウリュウ"},
+			Pos:    "カスタム人名",
+		},
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
+			Pos:    "カスタム名詞",
+		},
+	}
+	if _, err := r.NewUserDic(); err != nil {
+		t.Fatalf("user dic build error, %v", err)
+	}
+	if r[0].Text != "朝青龍" || r[1].Text != "日本経済新聞" {
+		t.Errorf("records were reordered: %+v", r)
+	}
+}
+
+func TestNewUserDicRecords06(t *testing.T) {
+	r := UserDicRecords{
+		{
+			Text:   " 朝青龍",
+			Tokens: []string{"朝青龍"},
+			Yomi:   []string{"アサショウリュウ"},
+			Pos:    "カスタム人名",
+		},
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
+			Pos:    "カスタム名詞",
+		},
+		{
+			Text:   "朝青龍",
+			Tokens: []string{"朝青龍"},
+			Yomi:   []string{"アサショウリュウ"},
+			Pos:    "カスタム人名",
+		},
+	}
+	if _, err := r.NewUserDic(); err == nil {
+		t.Errorf("expected error, but nil")
+	}
+}
+
+func TestUserDicRecordsLoadFromJSON(t *testing.T) {
+	var rec UserDicRecords
+	err := json.Unmarshal([]byte(`[
+		{
+			"text":"日本経済新聞",
+			"tokens":["日本","経済","新聞"],
+			"yomi":["ニホン","ケイザイ","シンブン"],
+			"pos":"カスタム名詞"
+		},
+		{
+			"text":"朝青龍",
+			"tokens":["朝青龍"],
+			"yomi":["アサショウリュウ"],
+			"pos":"カスタム人名"
+		}]`), &rec)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	expected := UserDicRecords{
+		{
+			Text:   "日本経済新聞",
+			Tokens: []string{"日本", "経済", "新聞"},
+			Yomi:   []string{"ニホン", "ケイザイ", "シンブン"},
+			Pos:    "カスタム名詞",
+		},
+		{
+			Text:   "朝青龍",
+			Tokens: []string{"朝青龍"},
+			Yomi:   []string{"アサショウリュウ"},
+			Pos:    "カスタム人名",
+		},
+	}
+
+	if !reflect.DeepEqual(rec, expected) {
+		t.Errorf("got %v, expected %v", rec, expected)
+	}
+}