当前位置:网站首页>Sogou cell thesaurus analysis (only extract words and word frequency)

Parsing Sogou cell dictionaries (.scel): extracting only the words and their frequencies

2022-04-23 18:52:00 Brick Porter

#pragma once
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <list>
#include <string>
#include <utility>
#include <vector>

// One dictionary entry: a word together with its (capped) frequency.
struct Data {
	// Builds an entry from a word and its raw 16-bit frequency.
	//
	// _word: the phrase text.
	// count: raw frequency; any value above 250 is stored as 250.
	//
	// The cap is applied to the 16-bit value BEFORE it is narrowed to one
	// byte. Narrowing first would wrap the value modulo 256 (e.g. 300 -> 44,
	// 256 -> 0) and the cap would then never see the real count.
	Data(std::wstring _word, std::uint16_t count)
		: word(std::move(_word)),
		  byRate(static_cast<std::uint8_t>(count > 250 ? 250 : count))
	{
	}

	std::wstring word;   // The word / phrase.
	std::uint8_t byRate; // Word frequency, in the range 0..250.
};
// Decoder for the in-memory bytes of a Sogou cell dictionary (.scel).
//
// All integers are two bytes, little-endian. The file has two main parts:
//
// 1. Global pinyin table (apparently every pinyin syllable, in dictionary
//    order), stored as a list of (index, len, pinyin) records:
//      index   two-byte integer, the index of this pinyin
//      len     two-byte integer, byte length of the pinyin text
//      pinyin  the pinyin text, two bytes per character, len bytes in total
//
// 2. Chinese phrase list, stored as a list of groups
//      (same, py_table_len, py_table, {word_len, word, ext_len, ext} x same)
//      same          two-byte integer, number of homophones in the group
//      py_table_len  two-byte integer, byte length of py_table
//      py_table      list of two-byte pinyin indices
//      word_len      two-byte integer, byte length of the phrase
//      word          the phrase, two bytes per character, word_len bytes
//      ext_len       two-byte integer, length of the extension (usually 10)
//      ext           extension data; the first two bytes are an integer
//                    (treated here as the word frequency, though that is
//                    not confirmed), the remaining bytes are all 0
//    The {word_len, word, ext_len, ext} record repeats `same` times; all
//    those homophones share the group's pinyin table.
//
// Only the words and their frequencies are extracted; the pinyin data is
// skipped over.
class SougouScelReader
{
public:
	// Offset of the global pinyin table from the start of the file.
	static const std::int32_t startPy = 0x1540;

	// Offset of the Chinese phrase list from the start of the file.
	static const std::int32_t startChinese = 0x2628;

	// Converts raw little-endian two-byte code units into a wide string.
	//
	// data: pointer to at least `len` readable bytes.
	// len:  number of bytes to decode. A dangling odd trailing byte is
	//       ignored instead of being paired with a byte beyond the buffer.
	// Returns the decoded characters; zero code units are dropped, so NUL
	// padding does not end up in the result.
	std::wstring byte2str(const std::uint8_t data[], std::size_t len) const
	{
		std::wstring str;
		str.reserve(len / 2);
		// `pos + 1 < len` guarantees both bytes of the pair are in range.
		for (std::size_t pos = 0; pos + 1 < len; pos += 2)
		{
			const wchar_t c = static_cast<wchar_t>(readU16(data + pos));
			if (c != 0)
			{
				str += c;
			}
		}
		return str;
	}

	// Walks the Chinese phrase list and appends every phrase to `out`.
	//
	// data: pointer to the first byte of the phrase list.
	// len:  number of readable bytes starting at `data`.
	// out:  receives one Data(word, frequency) per phrase, in file order.
	//
	// Every length field is validated against the bytes that remain before
	// it is used. Parsing stops quietly at the first field that would run
	// past the end of the buffer, so a truncated or corrupt file yields the
	// entries decoded up to that point instead of reading out of bounds.
	void getChinese(const std::uint8_t data[], std::size_t len, std::list<Data> &out) const
	{
		std::size_t pos = 0; // Invariant: pos <= len.

		// A group header is 4 bytes: `same` followed by `py_table_len`.
		while (len - pos >= 4)
		{
			const std::uint16_t same = readU16(data + pos);
			const std::uint16_t py_table_len = readU16(data + pos + 2);
			pos += 4;

			// The pinyin index table itself is not needed; skip it.
			if (len - pos < py_table_len)
				return;
			pos += py_table_len;

			for (std::uint16_t i = 0; i < same; ++i)
			{
				// Phrase length. Kept unsigned: a signed 16-bit value would
				// turn lengths >= 0x8000 negative and move the cursor
				// backwards.
				if (len - pos < 2)
					return;
				const std::uint16_t c_len = readU16(data + pos);
				pos += 2;

				// Phrase text.
				if (len - pos < c_len)
					return;
				const std::wstring word = byte2str(data + pos, c_len);
				pos += c_len;

				// Extension length.
				if (len - pos < 2)
					return;
				const std::uint16_t ext_len = readU16(data + pos);
				pos += 2;
				if (len - pos < ext_len)
					return;

				// The frequency is the first two bytes of the extension.
				// When the extension is too short to hold it, use 0 rather
				// than reading bytes that belong to the next record.
				std::uint16_t count = 0;
				if (ext_len >= 2)
					count = readU16(data + pos);

				out.push_back(Data(word, count));
				pos += ext_len;
			}
		}
	}

private:
	// Reads a little-endian two-byte unsigned integer from p[0] and p[1].
	static std::uint16_t readU16(const std::uint8_t *p)
	{
		return static_cast<std::uint16_t>(p[0] | (p[1] << 8));
	}
};
class CSogoScelParse
{
	std::wstring name;
	std::list<Data> words;
public:
	CSogoScelParse(std::wstring inputPath)
	{
		std::ifstream infile(inputPath.c_str(), std::ios_base::binary| std::ios_base::in);
		if (infile.is_open())
		{
			infile.seekg(0, std::ios_base::end);
			int nFileLen = infile.tellg();
			infile.seekg(0, std::ios_base::beg);
			byte* buffes = new byte[nFileLen];
			if (buffes)
			{
				SougouScelReader scelReader;
				if (nFileLen < scelReader.startChinese)// The file is too small , No further verification has been performed for the time being .
				{
					delete buffes;
					infile.close();
					return;
				}
				infile.read((char*)buffes, nFileLen);
				infile.close();
				// Thesaurus name 
				name = scelReader.byte2str(buffes+0x130, 0x338-0x130);
				// Parse word list 
				scelReader.getChinese(buffes + scelReader.startChinese, nFileLen - scelReader.startChinese,words);
			}
			delete buffes;			
		}		
	}
	size_t GetWordCount()const
	{
		return words.size();
	}
	const std::list<Data>& GetWordList()const
	{
		return words;
	}
};

版权声明
本文为[Brick Porter]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/04/202204210603257515.html