11/25
This commit is contained in:
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (C) 2009 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
|
||||
#define PINYINIME_INCLUDE_DICTBUILDER_H__
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "./utf16char.h"
|
||||
#include "./dictdef.h"
|
||||
#include "./dictlist.h"
|
||||
#include "./spellingtable.h"
|
||||
#include "./spellingtrie.h"
|
||||
#include "./splparser.h"
|
||||
|
||||
namespace ime_pinyin {
|
||||
|
||||
#ifdef ___BUILD_MODEL___
|
||||
|
||||
#define ___DO_STATISTICS___
|
||||
|
||||
class DictTrie;
|
||||
|
||||
class DictBuilder {
|
||||
private:
|
||||
// The raw lemma array buffer.
|
||||
LemmaEntry *lemma_arr_;
|
||||
size_t lemma_num_;
|
||||
|
||||
// Used to store all possible single char items.
|
||||
// Two items may have the same Hanzi while their spelling ids are different.
|
||||
SingleCharItem *scis_;
|
||||
size_t scis_num_;
|
||||
|
||||
// In the tree, root's level is -1.
|
||||
// Lemma nodes for root, and level 0
|
||||
LmaNodeLE0 *lma_nodes_le0_;
|
||||
|
||||
// Lemma nodes for layers whose levels are deeper than 0
|
||||
LmaNodeGE1 *lma_nodes_ge1_;
|
||||
|
||||
// Number of used lemma nodes
|
||||
size_t lma_nds_used_num_le0_;
|
||||
size_t lma_nds_used_num_ge1_;
|
||||
|
||||
// Used to store homophonies' ids.
|
||||
LemmaIdType *homo_idx_buf_;
|
||||
// Number of homophonies each of which only contains one Chinese character.
|
||||
size_t homo_idx_num_eq1_;
|
||||
// Number of homophonies each of which contains more than one character.
|
||||
size_t homo_idx_num_gt1_;
|
||||
|
||||
// The items with highest scores.
|
||||
LemmaEntry *top_lmas_;
|
||||
size_t top_lmas_num_;
|
||||
|
||||
SpellingTable *spl_table_;
|
||||
SpellingParser *spl_parser_;
|
||||
|
||||
#ifdef ___DO_STATISTICS___
|
||||
size_t max_sonbuf_len_[kMaxLemmaSize];
|
||||
size_t max_homobuf_len_[kMaxLemmaSize];
|
||||
|
||||
size_t total_son_num_[kMaxLemmaSize];
|
||||
size_t total_node_hasson_[kMaxLemmaSize];
|
||||
size_t total_sonbuf_num_[kMaxLemmaSize];
|
||||
size_t total_sonbuf_allnoson_[kMaxLemmaSize];
|
||||
size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
|
||||
size_t total_homo_num_[kMaxLemmaSize];
|
||||
|
||||
size_t sonbufs_num1_; // Number of son buffer with only 1 son
|
||||
size_t sonbufs_numgt1_; // Number of son buffer with more 1 son;
|
||||
|
||||
size_t total_lma_node_num_;
|
||||
|
||||
void stat_init();
|
||||
void stat_print();
|
||||
#endif
|
||||
|
||||
public:
|
||||
|
||||
DictBuilder();
|
||||
~DictBuilder();
|
||||
|
||||
// Build dictionary trie from the file fn_raw. File fn_validhzs provides
|
||||
// valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
|
||||
// included.
|
||||
bool build_dict(const char* fn_raw, const char* fn_validhzs,
|
||||
DictTrie *dict_trie);
|
||||
|
||||
private:
|
||||
// Fill in the buffer with id. The caller guarantees that the paramters are
|
||||
// vaild.
|
||||
void id_to_charbuf(unsigned char *buf, LemmaIdType id);
|
||||
|
||||
// Update the offset of sons for a node.
|
||||
void set_son_offset(LmaNodeGE1 *node, size_t offset);
|
||||
|
||||
// Update the offset of homophonies' ids for a node.
|
||||
void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);
|
||||
|
||||
// Format a speling string.
|
||||
void format_spelling_str(char *spl_str);
|
||||
|
||||
// Sort the lemma_arr by the hanzi string, and give each of unique items
|
||||
// a id. Why we need to sort the lemma list according to their Hanzi string
|
||||
// is to find items started by a given prefix string to do prediction.
|
||||
// Actually, the single char items are be in other order, for example,
|
||||
// in spelling id order, etc.
|
||||
// Return value is next un-allocated idx available.
|
||||
LemmaIdType sort_lemmas_by_hz();
|
||||
|
||||
// Build the SingleCharItem list, and fill the hanzi_scis_ids in the
|
||||
// lemma buffer lemma_arr_.
|
||||
// This function should be called after the lemma array is ready.
|
||||
// Return the number of unique SingleCharItem elements.
|
||||
size_t build_scis();
|
||||
|
||||
// Construct a subtree using a subset of the spelling array (from
|
||||
// item_star to item_end)
|
||||
// parent is the parent node to update the necessary information
|
||||
// parent can be a member of LmaNodeLE0 or LmaNodeGE1
|
||||
bool construct_subset(void* parent, LemmaEntry* lemma_arr,
|
||||
size_t item_start, size_t item_end, size_t level);
|
||||
|
||||
|
||||
// Read valid Chinese Hanzis from the given file.
|
||||
// num is used to return number of chars.
|
||||
// The return buffer is sorted and caller needs to free the returned buffer.
|
||||
char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);
|
||||
|
||||
|
||||
// Read a raw dictionary. max_item is the maximum number of items. If there
|
||||
// are more items in the ditionary, only the first max_item will be read.
|
||||
// Returned value is the number of items successfully read from the file.
|
||||
size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
|
||||
size_t max_item);
|
||||
|
||||
// Try to find if a character is in hzs buffer.
|
||||
bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);
|
||||
|
||||
// Try to find if all characters in str are in hzs buffer.
|
||||
bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
|
||||
const char16 *str, size_t str_len);
|
||||
|
||||
// Get these lemmas with toppest scores.
|
||||
void get_top_lemmas();
|
||||
|
||||
// Allocate resource to build dictionary.
|
||||
// lma_num is the number of items to be loaded
|
||||
bool alloc_resource(size_t lma_num);
|
||||
|
||||
// Free resource.
|
||||
void free_resource();
|
||||
};
|
||||
#endif // ___BUILD_MODEL___
|
||||
}
|
||||
|
||||
#endif // PINYINIME_INCLUDE_DICTBUILDER_H__
|
||||
Reference in New Issue
Block a user