11/25
This commit is contained in:
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (C) 2009 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PINYINIME_INCLUDE_NGRAM_H__
|
||||
#define PINYINIME_INCLUDE_NGRAM_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "./dictdef.h"
|
||||
|
||||
namespace ime_pinyin {
|
||||
|
||||
// Element type of the per-lemma frequency index array (see
// NGram::lma_freq_idx_, which is declared as a CODEBOOK_TYPE pointer).
typedef unsigned char CODEBOOK_TYPE;

// Number of entries in the score code book. 256 is the number of distinct
// values an 8-bit CODEBOOK_TYPE can take.
// NOTE(review): presumably every CODEBOOK_TYPE value is a valid index into a
// table of kCodeBookSize scores -- confirm against the implementation.
static const size_t kCodeBookSize = 256;
|
||||
|
||||
class NGram {
|
||||
public:
|
||||
// The maximum score of a lemma item.
|
||||
static const LmaScoreType kMaxScore = 0x3fff;
|
||||
|
||||
// In order to reduce the storage size, the original log value is amplified by
|
||||
// kScoreAmplifier, and we use LmaScoreType to store.
|
||||
// After this process, an item with a lower score has a higher frequency.
|
||||
static const int kLogValueAmplifier = -800;
|
||||
|
||||
// System words' total frequency. It is not the real total frequency, instead,
|
||||
// It is only used to adjust system lemmas' scores when the user dictionary's
|
||||
// total frequency changes.
|
||||
// In this version, frequencies of system lemmas are fixed. We are considering
|
||||
// to make them changable in next version.
|
||||
static const size_t kSysDictTotalFreq = 100000000;
|
||||
|
||||
private:
|
||||
|
||||
static NGram* instance_;
|
||||
|
||||
bool initialized_;
|
||||
uint32 idx_num_;
|
||||
|
||||
size_t total_freq_none_sys_;
|
||||
|
||||
// Score compensation for system dictionary lemmas.
|
||||
// Because after user adds some user lemmas, the total frequency changes, and
|
||||
// we use this value to normalize the score.
|
||||
float sys_score_compensation_;
|
||||
|
||||
#ifdef ___BUILD_MODEL___
|
||||
double *freq_codes_df_;
|
||||
#endif
|
||||
LmaScoreType *freq_codes_;
|
||||
CODEBOOK_TYPE *lma_freq_idx_;
|
||||
|
||||
public:
|
||||
NGram();
|
||||
~NGram();
|
||||
|
||||
static NGram& get_instance();
|
||||
|
||||
bool save_ngram(FILE *fp);
|
||||
bool load_ngram(FILE *fp);
|
||||
|
||||
// Set the total frequency of all none system dictionaries.
|
||||
void set_total_freq_none_sys(size_t freq_none_sys);
|
||||
|
||||
float get_uni_psb(LemmaIdType lma_id);
|
||||
|
||||
// Convert a probability to score. Actually, the score will be limited to
|
||||
// kMaxScore, but at runtime, we also need float expression to get accurate
|
||||
// value of the score.
|
||||
// After the conversion, a lower score indicates a higher probability of the
|
||||
// item.
|
||||
static float convert_psb_to_score(double psb);
|
||||
|
||||
#ifdef ___BUILD_MODEL___
|
||||
// For constructing the unigram mode model.
|
||||
bool build_unigram(LemmaEntry *lemma_arr, size_t num,
|
||||
LemmaIdType next_idx_unused);
|
||||
#endif
|
||||
};
|
||||
}
|
||||
|
||||
#endif // PINYINIME_INCLUDE_NGRAM_H__
|
||||
Reference in New Issue
Block a user