385 lines
14 KiB
C++
385 lines
14 KiB
C++
#ifndef MARISA_TRIE_H_
|
|
#define MARISA_TRIE_H_
|
|
|
|
#include "base.h"
|
|
|
|
#ifdef __cplusplus
|
|
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
#include "progress.h"
|
|
#include "key.h"
|
|
#include "query.h"
|
|
#include "container.h"
|
|
#include "intvector.h"
|
|
#include "bitvector.h"
|
|
#include "tail.h"
|
|
|
|
namespace marisa {
|
|
|
|
class Trie {
|
|
public:
|
|
Trie();
|
|
|
|
void build(const char * const *keys, std::size_t num_keys,
|
|
const std::size_t *key_lengths = NULL,
|
|
const double *key_weights = NULL,
|
|
UInt32 *key_ids = NULL, int flags = 0);
|
|
|
|
void build(const std::vector<std::string> &keys,
|
|
std::vector<UInt32> *key_ids = NULL, int flags = 0);
|
|
void build(const std::vector<std::pair<std::string, double> > &keys,
|
|
std::vector<UInt32> *key_ids = NULL, int flags = 0);
|
|
|
|
void mmap(Mapper *mapper, const char *filename,
|
|
long offset = 0, int whence = SEEK_SET);
|
|
void map(const void *ptr, std::size_t size);
|
|
void map(Mapper &mapper);
|
|
|
|
void load(const char *filename,
|
|
long offset = 0, int whence = SEEK_SET);
|
|
void fread(std::FILE *file);
|
|
void read(int fd);
|
|
void read(std::istream &stream);
|
|
void read(Reader &reader);
|
|
|
|
void save(const char *filename, bool trunc_flag = true,
|
|
long offset = 0, int whence = SEEK_SET) const;
|
|
void fwrite(std::FILE *file) const;
|
|
void write(int fd) const;
|
|
void write(std::ostream &stream) const;
|
|
void write(Writer &writer) const;
|
|
|
|
std::string operator[](UInt32 key_id) const;
|
|
|
|
UInt32 operator[](const char *str) const;
|
|
UInt32 operator[](const std::string &str) const;
|
|
|
|
std::string restore(UInt32 key_id) const;
|
|
void restore(UInt32 key_id, std::string *key) const;
|
|
std::size_t restore(UInt32 key_id, char *key_buf,
|
|
std::size_t key_buf_size) const;
|
|
|
|
UInt32 lookup(const char *str) const;
|
|
UInt32 lookup(const char *ptr, std::size_t length) const;
|
|
UInt32 lookup(const std::string &str) const;
|
|
|
|
std::size_t find(const char *str,
|
|
UInt32 *key_ids, std::size_t *key_lengths,
|
|
std::size_t max_num_results) const;
|
|
std::size_t find(const char *ptr, std::size_t length,
|
|
UInt32 *key_ids, std::size_t *key_lengths,
|
|
std::size_t max_num_results) const;
|
|
std::size_t find(const std::string &str,
|
|
UInt32 *key_ids, std::size_t *key_lengths,
|
|
std::size_t max_num_results) const;
|
|
|
|
std::size_t find(const char *str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::size_t> *key_lengths = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t find(const char *ptr, std::size_t length,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::size_t> *key_lengths = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t find(const std::string &str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::size_t> *key_lengths = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
|
|
UInt32 find_first(const char *str,
|
|
std::size_t *key_length = NULL) const;
|
|
UInt32 find_first(const char *ptr, std::size_t length,
|
|
std::size_t *key_length = NULL) const;
|
|
UInt32 find_first(const std::string &str,
|
|
std::size_t *key_length = NULL) const;
|
|
|
|
UInt32 find_last(const char *str,
|
|
std::size_t *key_length = NULL) const;
|
|
UInt32 find_last(const char *ptr, std::size_t length,
|
|
std::size_t *key_length = NULL) const;
|
|
UInt32 find_last(const std::string &str,
|
|
std::size_t *key_length = NULL) const;
|
|
|
|
// bool callback(UInt32 key_id, std::size_t key_length);
|
|
template <typename T>
|
|
std::size_t find_callback(const char *str, T callback) const;
|
|
template <typename T>
|
|
std::size_t find_callback(const char *ptr, std::size_t length,
|
|
T callback) const;
|
|
template <typename T>
|
|
std::size_t find_callback(const std::string &str, T callback) const;
|
|
|
|
std::size_t predict(const char *str,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
std::size_t predict(const char *ptr, std::size_t length,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
std::size_t predict(const std::string &str,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
|
|
std::size_t predict(const char *str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t predict(const char *ptr, std::size_t length,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t predict(const std::string &str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
|
|
std::size_t predict_breadth_first(const char *str,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
std::size_t predict_breadth_first(const char *ptr, std::size_t length,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
std::size_t predict_breadth_first(const std::string &str,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
|
|
std::size_t predict_breadth_first(const char *str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t predict_breadth_first(const char *ptr, std::size_t length,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t predict_breadth_first(const std::string &str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
|
|
std::size_t predict_depth_first(const char *str,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
std::size_t predict_depth_first(const char *ptr, std::size_t length,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
std::size_t predict_depth_first(const std::string &str,
|
|
UInt32 *key_ids, std::string *keys, std::size_t max_num_results) const;
|
|
|
|
std::size_t predict_depth_first(const char *str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t predict_depth_first(const char *ptr, std::size_t length,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
std::size_t predict_depth_first(const std::string &str,
|
|
std::vector<UInt32> *key_ids = NULL,
|
|
std::vector<std::string> *keys = NULL,
|
|
std::size_t max_num_results = MARISA_MAX_NUM_KEYS) const;
|
|
|
|
// bool callback(UInt32 key_id, const std::string &key);
|
|
template <typename T>
|
|
std::size_t predict_callback(const char *str, T callback) const;
|
|
template <typename T>
|
|
std::size_t predict_callback(const char *ptr, std::size_t length,
|
|
T callback) const;
|
|
template <typename T>
|
|
std::size_t predict_callback(const std::string &str, T callback) const;
|
|
|
|
bool empty() const;
|
|
std::size_t num_tries() const;
|
|
std::size_t num_keys() const;
|
|
std::size_t num_nodes() const;
|
|
std::size_t total_size() const;
|
|
|
|
void clear();
|
|
void swap(Trie *rhs);
|
|
|
|
static UInt32 notfound();
|
|
static std::size_t mismatch();
|
|
|
|
private:
|
|
BitVector louds_;
|
|
Vector<UInt8> labels_;
|
|
BitVector terminal_flags_;
|
|
BitVector link_flags_;
|
|
IntVector links_;
|
|
std::auto_ptr<Trie> trie_;
|
|
Tail tail_;
|
|
UInt32 num_first_branches_;
|
|
UInt32 num_keys_;
|
|
|
|
void build_trie(Vector<Key<String> > &keys,
|
|
std::vector<UInt32> *key_ids, int flags);
|
|
void build_trie(Vector<Key<String> > &keys,
|
|
UInt32 *key_ids, int flags);
|
|
|
|
template <typename T>
|
|
void build_trie(Vector<Key<T> > &keys,
|
|
Vector<UInt32> *terminals, Progress &progress);
|
|
|
|
template <typename T>
|
|
void build_cur(Vector<Key<T> > &keys,
|
|
Vector<UInt32> *terminals, Progress &progress);
|
|
|
|
void build_next(Vector<Key<String> > &keys,
|
|
Vector<UInt32> *terminals, Progress &progress);
|
|
void build_next(Vector<Key<RString> > &rkeys,
|
|
Vector<UInt32> *terminals, Progress &progress);
|
|
|
|
template <typename T>
|
|
UInt32 sort_keys(Vector<Key<T> > &keys) const;
|
|
|
|
template <typename T>
|
|
void build_terminals(const Vector<Key<T> > &keys,
|
|
Vector<UInt32> *terminals) const;
|
|
|
|
void restore_(UInt32 key_id, std::string *key) const;
|
|
void trie_restore(UInt32 node, std::string *key) const;
|
|
void tail_restore(UInt32 node, std::string *key) const;
|
|
|
|
std::size_t restore_(UInt32 key_id, char *key_buf,
|
|
std::size_t key_buf_size) const;
|
|
void trie_restore(UInt32 node, char *key_buf,
|
|
std::size_t key_buf_size, std::size_t &key_pos) const;
|
|
void tail_restore(UInt32 node, char *key_buf,
|
|
std::size_t key_buf_size, std::size_t &key_pos) const;
|
|
|
|
template <typename T>
|
|
UInt32 lookup_(T query) const;
|
|
template <typename T>
|
|
bool find_child(UInt32 &node, T query, std::size_t &pos) const;
|
|
template <typename T>
|
|
std::size_t trie_match(UInt32 node, T query, std::size_t pos) const;
|
|
template <typename T>
|
|
std::size_t tail_match(UInt32 node, UInt32 link_id,
|
|
T query, std::size_t pos) const;
|
|
|
|
template <typename T, typename U, typename V>
|
|
std::size_t find_(T query, U key_ids, V key_lengths,
|
|
std::size_t max_num_results) const;
|
|
template <typename T>
|
|
UInt32 find_first_(T query, std::size_t *key_length) const;
|
|
template <typename T>
|
|
UInt32 find_last_(T query, std::size_t *key_length) const;
|
|
template <typename T, typename U>
|
|
std::size_t find_callback_(T query, U callback) const;
|
|
|
|
template <typename T, typename U, typename V>
|
|
std::size_t predict_breadth_first_(T query, U key_ids, V keys,
|
|
std::size_t max_num_results) const;
|
|
template <typename T, typename U, typename V>
|
|
std::size_t predict_depth_first_(T query, U key_ids, V keys,
|
|
std::size_t max_num_results) const;
|
|
template <typename T, typename U>
|
|
std::size_t predict_callback_(T query, U callback) const;
|
|
|
|
template <typename T>
|
|
bool predict_child(UInt32 &node, T query, std::size_t &pos,
|
|
std::string *key) const;
|
|
template <typename T>
|
|
std::size_t trie_prefix_match(UInt32 node, T query,
|
|
std::size_t pos, std::string *key) const;
|
|
template <typename T>
|
|
std::size_t tail_prefix_match(UInt32 node, UInt32 link_id,
|
|
T query, std::size_t pos, std::string *key) const;
|
|
|
|
UInt32 key_id_to_node(UInt32 key_id) const;
|
|
UInt32 node_to_key_id(UInt32 node) const;
|
|
UInt32 louds_pos_to_node(UInt32 louds_pos, UInt32 parent_node) const;
|
|
|
|
UInt32 get_child(UInt32 node) const;
|
|
UInt32 get_parent(UInt32 node) const;
|
|
|
|
bool has_link(UInt32 node) const;
|
|
UInt32 get_link_id(UInt32 node) const;
|
|
UInt32 get_link(UInt32 node) const;
|
|
UInt32 get_link(UInt32 node, UInt32 link_id) const;
|
|
|
|
bool has_link() const;
|
|
bool has_trie() const;
|
|
bool has_tail() const;
|
|
|
|
// Disallows copy and assignment.
|
|
Trie(const Trie &);
|
|
Trie &operator=(const Trie &);
|
|
};
|
|
|
|
} // namespace marisa
|
|
|
|
#include "trie-inline.h"
|
|
|
|
#else // __cplusplus
|
|
|
|
#include <stdio.h>
|
|
|
|
#endif // __cplusplus
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif // __cplusplus
|
|
|
|
typedef struct marisa_trie_ marisa_trie;
|
|
|
|
marisa_status marisa_init(marisa_trie **h);
|
|
marisa_status marisa_end(marisa_trie *h);
|
|
|
|
marisa_status marisa_build(marisa_trie *h, const char * const *keys,
|
|
size_t num_keys, const size_t *key_lengths, const double *key_weights,
|
|
marisa_uint32 *key_ids, int flags);
|
|
|
|
marisa_status marisa_mmap(marisa_trie *h, const char *filename,
|
|
long offset, int whence);
|
|
marisa_status marisa_map(marisa_trie *h, const void *ptr, size_t size);
|
|
|
|
marisa_status marisa_load(marisa_trie *h, const char *filename,
|
|
long offset, int whence);
|
|
marisa_status marisa_fread(marisa_trie *h, FILE *file);
|
|
marisa_status marisa_read(marisa_trie *h, int fd);
|
|
|
|
marisa_status marisa_save(const marisa_trie *h, const char *filename,
|
|
int trunc_flag, long offset, int whence);
|
|
marisa_status marisa_fwrite(const marisa_trie *h, FILE *file);
|
|
marisa_status marisa_write(const marisa_trie *h, int fd);
|
|
|
|
marisa_status marisa_restore(const marisa_trie *h, marisa_uint32 key_id,
|
|
char *key_buf, size_t key_buf_size, size_t *key_length);
|
|
|
|
marisa_status marisa_lookup(const marisa_trie *h,
|
|
const char *ptr, size_t length, marisa_uint32 *key_id);
|
|
|
|
marisa_status marisa_find(const marisa_trie *h,
|
|
const char *ptr, size_t length,
|
|
marisa_uint32 *key_ids, size_t *key_lengths,
|
|
size_t max_num_results, size_t *num_results);
|
|
marisa_status marisa_find_first(const marisa_trie *h,
|
|
const char *ptr, size_t length,
|
|
marisa_uint32 *key_id, size_t *key_length);
|
|
marisa_status marisa_find_last(const marisa_trie *h,
|
|
const char *ptr, size_t length,
|
|
marisa_uint32 *key_id, size_t *key_length);
|
|
marisa_status marisa_find_callback(const marisa_trie *h,
|
|
const char *ptr, size_t length,
|
|
int (*callback)(void *, marisa_uint32, size_t),
|
|
void *first_arg_to_callback);
|
|
|
|
marisa_status marisa_predict(const marisa_trie *h,
|
|
const char *ptr, size_t length, marisa_uint32 *key_ids,
|
|
size_t max_num_results, size_t *num_results);
|
|
marisa_status marisa_predict_breadth_first(const marisa_trie *h,
|
|
const char *ptr, size_t length, marisa_uint32 *key_ids,
|
|
size_t max_num_results, size_t *num_results);
|
|
marisa_status marisa_predict_depth_first(const marisa_trie *h,
|
|
const char *ptr, size_t length, marisa_uint32 *key_ids,
|
|
size_t max_num_results, size_t *num_results);
|
|
marisa_status marisa_predict_callback(const marisa_trie *h,
|
|
const char *ptr, size_t length,
|
|
int (*callback)(void *, marisa_uint32, const char *, size_t),
|
|
void *first_arg_to_callback);
|
|
|
|
size_t marisa_get_num_tries(const marisa_trie *h);
|
|
size_t marisa_get_num_keys(const marisa_trie *h);
|
|
size_t marisa_get_num_nodes(const marisa_trie *h);
|
|
size_t marisa_get_total_size(const marisa_trie *h);
|
|
|
|
marisa_status marisa_clear(marisa_trie *h);
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
#endif // __cplusplus
|
|
|
|
#endif // MARISA_TRIE_H_
|