-
Notifications
You must be signed in to change notification settings - Fork 20
/
terms.h
194 lines (157 loc) · 7.45 KB
/
terms.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#pragma once
#include "codecs.h"
#include <compress.h>
#include <switch_mallocators.h>
// Prefix compressed terms dictionary
// Maps from str8_t=>term_index_ctx
namespace Trinity {
// We can no longer omit (term, term_index_ctx) from the terms data file and keep
// that just in the index, because while it works great for lookups, it means we can't trivially iterate
// over all terms in the terms data file (see terms_data_view struct), and this is important for merging segments.
//
// For other applications that do not need access to all terms, one could get those structures, make sure TRINITY_TERMS_FAT_INDEX is defined
// and use it.
//#define TRINITY_TERMS_FAT_INDEX
// A single entry of the terms skiplist: a term together with an offset
// into the terms data file.
struct terms_skiplist_entry final {
        str8_t term;
        // offset in the terms datafile (present regardless of TRINITY_TERMS_FAT_INDEX)
        uint32_t blockOffset;
#ifdef TRINITY_TERMS_FAT_INDEX
        // payload; only stored in the entry when the fat index is enabled
        term_index_ctx tctx;
#endif
};
// Looks up `term` in the terms dictionary and returns its term_index_ctx.
//
// termsData: range over the terms data (SegmentTerms passes its own termsData here).
// term:      the term to look up.
// skipList:  skiplist entries; presumably those filled in by unpack_terms_skiplist() — confirm.
//
// NOTE(review): the definition is not in this header; what is returned for a term
// that is not present cannot be determined from here — check the implementation.
term_index_ctx lookup_term(range_base<const uint8_t *, uint32_t> termsData,
const str8_t term,
const std::vector<terms_skiplist_entry> &skipList);
// Presumably decodes the terms index held in `termsIndex` and stores the resulting
// entries into `*skipList` — the definition is not in this header; confirm there.
//
// termsIndex: range over the terms index bytes.
// skipList:   output vector of skiplist entries.
// allocator:  presumably provides the storage backing the entries' term strings — TODO confirm.
//
// NOTE(review): the range type here uses `const uint32_t` as its second template
// argument, whereas the other declarations in this header use `uint32_t`; verify that
// this is intentional before touching it, since changing it alters the signature.
void unpack_terms_skiplist(const range_base<const uint8_t *, const uint32_t> termsIndex,
std::vector<terms_skiplist_entry> * skipList,
simple_allocator & allocator);
// Presumably encodes `terms` into the terms data file contents (`data`) and the
// terms index (`index`) — the definition is not in this header; confirm there.
//
// terms: (term, term_index_ctx) pairs to pack. Taken by non-const reference, so the
//        implementation may modify or reorder the vector — verify against the definition.
// data:  output buffer for the terms data.
// index: output buffer for the terms index.
void pack_terms(std::vector<std::pair<str8_t, term_index_ctx>> &terms,
IOBuffer *const data,
IOBuffer *const index);
// Abstract wrapper for walking the terms of an index source.
//
// Segments will most likely rely on the prefix-compressed terms infrastructure, but an
// index source is free to keep its terms elsewhere (e.g. in an in-memory
// std::unordered_map<> or anything else); in that case, write an IndexSourceTermsView
// subclass that exposes them.
//
// IndexSourceTermsView subclasses are used while merging index sources.
// See merge.h
struct IndexSourceTermsView {
        // The (term, term_index_ctx) pair the view is currently positioned at
        virtual std::pair<str8_t, term_index_ctx> cur() = 0;

        // Advance to the next term
        virtual void next() = 0;

        // True once there are no more terms to visit
        virtual bool done() = 0;

        virtual ~IndexSourceTermsView() = default;
};
// Iterator access to the terms data.
// This is very useful for merging terms dictionaries (see IndexSourcePrefixCompressedTermsView).
struct terms_data_view final {
      public:
        // Iterator over the (term, term_index_ctx) pairs encoded in the terms data.
        //
        // The current term is decoded on demand via decode_cur() (defined elsewhere)
        // whenever term(), tctx() or operator*() is used.
        struct iterator final {
                friend struct terms_data_view;

              private:
                // Position in the terms data
                const uint8_t *p;

                // WAS: str8_t::value_type termStorage[Limits::MaxTermLength];
                //
                // people make mistakes; sometimes they do index terms longer than Limits::MaxTermLength
                // and when decoding said terms they will overrun termStorage.
                // We are now explicitly sizing it so that it can fit anything and thanks to
                // RVO the cost shouldn't be felt by trinity applications.
                //
                // The buffer was previously 128 bytes, which does not "fit anything":
                // a decoded term longer than that would still write past the end.
                // 256 covers every length an 8-bit length field can express.
                // NOTE(review): assumes str8_t stores its length in 8 bits (max 255) — confirm.
                str8_t::value_type termStorage[256];

              public:
                // The most recently decoded term and its context.
                // cur.term.p always points at this iterator's own termStorage.
                struct
                {
                        str8_t term;
                        term_index_ctx tctx;
                } cur;

                // Deliberately not explicit: begin()/end() rely on copy-list-initialization
                // of the returned iterator from a raw pointer.
                iterator(const uint8_t *ptr)
                    : p{ptr} {
                        cur.term.p = termStorage;
                        cur.term.len = 0;
                }

                // cur.term points into termStorage, so a member-wise copy would leave the
                // copy referencing the source iterator's buffer; copying is disabled.
                iterator(const iterator &o) = delete;

                iterator &operator=(const iterator &) = delete;

                // Iterators compare equal when they are at the same position in the data
                inline bool operator==(const iterator &o) const noexcept {
                        return p == o.p;
                }

                inline bool operator!=(const iterator &o) const noexcept {
                        return p != o.p;
                }

                // Decodes the current entry and returns its term.
                // The returned str8_t references this iterator's internal storage.
                str8_t term() noexcept {
                        decode_cur();
                        return cur.term;
                }

                // Decodes the current entry and returns its term_index_ctx
                term_index_ctx tctx() noexcept {
                        decode_cur();
                        return cur.tctx;
                }

                // Resets the length of the cached term; `p` is not touched here.
                // NOTE(review): presumably decode_cur() is what advances `p`, and a zero
                // length tells it that the next entry has to be decoded — confirm against
                // the definition of decode_cur().
                inline iterator &operator++() {
                        cur.term.len = 0;
                        return *this;
                }

                // Decodes the current entry and returns it as a (term, term_index_ctx) pair
                inline std::pair<str8_t, term_index_ctx> operator*() noexcept {
                        decode_cur();
                        return {cur.term, cur.tctx};
                }

              protected:
                // Defined out of line (not in this header)
                void decode_cur();
        };

      private:
        const range_base<const uint8_t *, uint32_t> termsData;

      public:
        // Iterator positioned at the start of the terms data
        iterator begin() const {
                return {termsData.start()};
        }

        // Iterator positioned at the end of the terms data
        iterator end() const {
                return {termsData.stop()};
        }

        terms_data_view(const range_base<const uint8_t *, uint32_t> d)
            : termsData{d} {
        }
};
// A specialised IndexSourceTermsView for accessing prefix-encoded terms dictionaries
struct IndexSourcePrefixCompressedTermsView final
: public IndexSourceTermsView {
private:
terms_data_view::iterator it;
const terms_data_view::iterator end;
public:
IndexSourcePrefixCompressedTermsView(const range_base<const uint8_t *, uint32_t> termsData)
: it{termsData.start()}, end{termsData.stop()} {
}
std::pair<str8_t, term_index_ctx> cur() override final {
return *it;
}
void next() override final {
++it;
}
bool done() override final {
return it == end;
}
};
// A handy wrapper for memory mapped terms data and a skiplist from the terms index.
//
// The destructor unmaps termsData, so an instance must be the only one referring to
// that mapping: copying is disabled, otherwise two instances would munmap() the
// same region.
class SegmentTerms final {
      private:
        std::vector<terms_skiplist_entry> skiplist;
        simple_allocator allocator;
        range_base<const uint8_t *, uint32_t> termsData;

      public:
        // Defined out of line (not in this header).
        // NOTE(review): presumably maps the terms data and loads the skiplist for the
        // segment found at segmentBasePath — confirm against the definition.
        SegmentTerms(const char *segmentBasePath);

        // The destructor releases the mapping; a copy would release it a second time.
        SegmentTerms(const SegmentTerms &) = delete;
        SegmentTerms &operator=(const SegmentTerms &) = delete;

        ~SegmentTerms() noexcept {
                // munmap() wants a non-const void *; nothing is unmapped when the range has a null base
                if (auto ptr = const_cast<void *>(static_cast<const void *>(termsData.offset))) {
                        munmap(ptr, termsData.size());
                }
        }

        // Returns the term_index_ctx for `term`, as resolved by lookup_term()
        term_index_ctx lookup(const str8_t term) const {
                return lookup_term(termsData, term, skiplist);
        }

        // Returns a terms_data_view over the terms data, for iterating all terms.
        // The view refers to this object's mapping and must not outlive it.
        auto terms_data_access() const {
                return terms_data_view(termsData);
        }

        // Allocates a new IndexSourcePrefixCompressedTermsView over the terms data.
        // The caller owns the returned pointer and is responsible for deleting it;
        // the view refers to this object's mapping and must not outlive it.
        auto new_terms_view() const {
                return new IndexSourcePrefixCompressedTermsView(termsData);
        }
};
} // namespace Trinity