add: data-structure-lab & compiler-lab
This commit is contained in:
3
zip-unzip-search/.gitignore
vendored
Normal file
3
zip-unzip-search/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
.idea/
|
||||
cmake-*/
|
||||
build/
|
10
zip-unzip-search/CMakeLists.txt
Normal file
10
zip-unzip-search/CMakeLists.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
cmake_minimum_required(VERSION 3.22)
project(zip_unzip_search)

# The sources are written against C++11. Require it so that configuration
# fails instead of silently falling back to an older standard.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Public headers live in include/.
include_directories(${PROJECT_SOURCE_DIR}/include)

# Collect every source file under src/ into SRCS.
aux_source_directory(${PROJECT_SOURCE_DIR}/src SRCS)

add_executable(zip_unzip_search main.cpp ${SRCS})
|
11
zip-unzip-search/include/const.h
Normal file
11
zip-unzip-search/include/const.h
Normal file
@@ -0,0 +1,11 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
|
||||
#ifndef ZIP_UNZIP_SEARCH_CONST_H
|
||||
#define ZIP_UNZIP_SEARCH_CONST_H
|
||||
|
||||
// Number of distinct ASCII code points (0..127) handled by the program.
#define ASCII_LENGTH 128
|
||||
|
||||
#endif //ZIP_UNZIP_SEARCH_CONST_H
|
82
zip-unzip-search/include/file_io.h
Normal file
82
zip-unzip-search/include/file_io.h
Normal file
@@ -0,0 +1,82 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
|
||||
#ifndef ZIP_UNZIP_SEARCH_FILE_IO_H
|
||||
#define ZIP_UNZIP_SEARCH_FILE_IO_H
|
||||
#include "string"
|
||||
|
||||
/**
 * Header record stored at the very start of a compressed file.
 *
 * The struct is written to and read from disk as raw bytes, so the order and
 * the types of its fields define the on-disk layout.
 */
struct MetaData
{
    /**
     * Number of entries in the Huffman node array that follows this header.
     */
    int HuffmanNodeLength;
    /**
     * Index of the Huffman tree's root node inside the node array.
     */
    int HuffmanRoot;
    /**
     * Number of bits that are actually used in the last 32-bit buffer of the
     * file.
     */
    int LastBufferUsedLength;
};
|
||||
|
||||
/**
 * Static helpers that read and write the files handled by the tool.
 */
class FileIO
{
public:
    /**
     * Count how often every character occurs in a file.
     * @param fileName name of the file to scan
     * @return heap-allocated array of per-character counters; the caller
     *         owns it and has to release it
     */
    static int* ReadCharFrequency(const std::string& fileName);

    /**
     * Compress a file.
     * @param inputFile name of the file to compress
     * @param outputFile name of the compressed file to write
     */
    static void WriteZipFile(const std::string& inputFile, const std::string& outputFile);

    /**
     * Decompress a file.
     * @param inputFile name of the compressed file
     * @param outputFile name of the decompressed file to write
     */
    static void WriteUnzipFile(const std::string& inputFile, const std::string& outputFile);

    /**
     * Compute the compression ratio of a finished compression run.
     * @param inputFileName name of the original file
     * @param outputFileName name of the compressed file
     * @return size of the compressed file divided by the size of the original
     */
    static double CalculateZipRate(const std::string& inputFileName, const std::string& outputFileName);
};
|
||||
|
||||
/**
 * Bit-by-bit reader over the payload of a compressed file.
 *
 * The object owns the underlying FILE handle: it is opened by the constructor
 * and closed by the destructor.
 */
class BinaryBuffer
{
public:
    /**
     * Open the compressed file and move past its header.
     * @param inputFileName name of the compressed file
     */
    explicit BinaryBuffer(std::string& inputFileName);

    ~BinaryBuffer();

    // The destructor closes the handle, so a copy would close it twice.
    BinaryBuffer(const BinaryBuffer&) = delete;
    BinaryBuffer& operator=(const BinaryBuffer&) = delete;

    /**
     * Read the next bit.
     * @return 0 or 1, or -1 once the file is exhausted
     */
    char read();

    /**
     * Number of bits consumed so far, counted from the start of the file.
     */
    int position = 0;

private:
    FILE* file = nullptr;

    // Current 32-bit word; the next bit to hand out is its most significant bit.
    int buffer = 0;
    // Number of bits of `buffer` that have not been handed out yet.
    int bufferPos = 0;
    // Set once a read from the file has failed.
    bool readFinishedFlag = false;
};
|
||||
|
||||
#endif //ZIP_UNZIP_SEARCH_FILE_IO_H
|
103
zip-unzip-search/include/huffman.h
Normal file
103
zip-unzip-search/include/huffman.h
Normal file
@@ -0,0 +1,103 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
|
||||
#ifndef ZIP_UNZIP_SEARCH_HUFFMAN_H
|
||||
#define ZIP_UNZIP_SEARCH_HUFFMAN_H
|
||||
#include "vector"
|
||||
#include "array"
|
||||
#include "const.h"
|
||||
|
||||
/**
 * One node of the Huffman tree.
 *
 * Nodes are stored in a flat array and refer to their children by index. The
 * struct is written to the compressed file as raw bytes, so the order and the
 * types of its fields must not change.
 */
struct HuffmanNode
{
    /**
     * Number of the node.
     */
    int id;
    /**
     * Character represented by the node; -1 when the node is not a leaf.
     */
    char data;
    /**
     * How often the character occurs in the file, i.e. the weight of the
     * node in the Huffman tree.
     */
    int frequency;
    /**
     * Index of the left child in the node array.
     */
    int lIndex;
    /**
     * Index of the right child in the node array.
     */
    int rIndex;
};
|
||||
|
||||
class HuffmanCode
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* 树中节点列表
|
||||
*/
|
||||
std::vector<HuffmanNode>* nodes = new std::vector<HuffmanNode>();
|
||||
|
||||
/**
|
||||
* 哈夫曼树根节点索引
|
||||
*/
|
||||
int root = -1;
|
||||
|
||||
/**
|
||||
* 从字符的频率数组出发创建
|
||||
* @param frequencyArray
|
||||
*/
|
||||
explicit HuffmanCode(const int * frequencyArray);
|
||||
|
||||
/**
|
||||
* 从哈夫曼数组节点出发创建
|
||||
* @param nodeArray 哈夫曼节点数组
|
||||
* @param length 节点数组的长度
|
||||
*/
|
||||
HuffmanCode(HuffmanNode *nodeArray, int length);
|
||||
|
||||
~HuffmanCode();
|
||||
|
||||
/**
|
||||
* 创建哈夫曼树
|
||||
*/
|
||||
void createHuffmanTree();
|
||||
|
||||
/**
|
||||
* 打印哈夫曼树
|
||||
*/
|
||||
void printHuffmanTree();
|
||||
|
||||
/**
|
||||
* 得到哈夫曼编码
|
||||
* @return 哈夫曼编码字典
|
||||
*/
|
||||
std::array<std::vector<char>, ASCII_LENGTH> * getHuffmanCode();
|
||||
|
||||
/**
|
||||
* 打印哈夫曼编码字典
|
||||
* @param dictionary 字典
|
||||
*/
|
||||
static void printHuffmanCode(const std::array<std::vector<char>, ASCII_LENGTH>& dictionary);
|
||||
|
||||
private:
|
||||
/**
|
||||
* 对森林列表按权值排序
|
||||
* @param forests
|
||||
*/
|
||||
static void sortForests(std::vector<HuffmanNode>& forests);
|
||||
|
||||
void printHuffmanTreeR(int nodeId);
|
||||
|
||||
void getHuffmanCodeR(std::array<std::vector<char>, ASCII_LENGTH> &dictionary, int nodeId, std::vector<char> &code);
|
||||
|
||||
};
|
||||
#endif //ZIP_UNZIP_SEARCH_HUFFMAN_H
|
31
zip-unzip-search/include/logging.h
Normal file
31
zip-unzip-search/include/logging.h
Normal file
@@ -0,0 +1,31 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
|
||||
#ifndef ZIP_UNZIP_SEARCH_LOGGING_H
|
||||
#define ZIP_UNZIP_SEARCH_LOGGING_H
|
||||
#include "string"
|
||||
|
||||
/**
 * Console logging helpers, one per severity.
 */
class Logging
{
public:
    /**
     * Print an informational message.
     * @param info message text
     */
    static void LoggingInfo(const std::string& info);

    /**
     * Print a warning.
     * @param warning message text
     */
    static void LoggingWarning(const std::string& warning);

    /**
     * Print an error.
     * @param error message text
     */
    static void LoggingError(const std::string& error);
};
|
||||
|
||||
#endif //ZIP_UNZIP_SEARCH_LOGGING_H
|
55
zip-unzip-search/include/search.h
Normal file
55
zip-unzip-search/include/search.h
Normal file
@@ -0,0 +1,55 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-16.
|
||||
//
|
||||
|
||||
#ifndef ZIP_UNZIP_SEARCH_SEARCH_H
|
||||
#define ZIP_UNZIP_SEARCH_SEARCH_H
|
||||
#include "vector"
|
||||
#include "array"
|
||||
#include "string"
|
||||
|
||||
/**
 * Boyer-Moore search over a sequence of bits.
 */
class BMSearch
{
public:
    /**
     * Prepare the shift tables for a pattern.
     * @param sample pattern to look for, one bit (0 or 1) per element
     */
    explicit BMSearch(std::vector<char>& sample);

    ~BMSearch();

    // The tables and the pattern copy are owning raw pointers; a copy of the
    // object would release them twice.
    BMSearch(const BMSearch&) = delete;
    BMSearch& operator=(const BMSearch&) = delete;

    /**
     * Match the pattern against a binary file.
     * @param fileName binary file to search; its metadata and Huffman node
     *                 array are read first
     */
    void matchFile(std::string &fileName);

private:
    // Bad-character rule table.
    // The text is a string of 0s and 1s.
    int* badCharArray;

    // Good-suffix rule table.
    int* goodSuffixArray;

    // Private copy of the pattern.
    std::vector<char>* sample;

    /**
     * Fill the bad-character table.
     * @param s pattern
     */
    void generateBrokenCharArray(std::vector<char>& s);

    /**
     * Fill the good-suffix table.
     * @param s pattern
     */
    void generateGoodSuffixArray(std::vector<char>& s);

    // Larger of the two arguments.
    static int max(int a, int b);
};
|
||||
|
||||
/**
 * Search a compressed file for a plain-text sample.
 *
 * The sample is converted to its Huffman bit string using the node array
 * stored in the file, and the file is then searched for that bit string.
 * @param fileName name of the compressed file
 * @param sample NUL-terminated text to look for
 */
void SearchInFile(char* fileName, char* sample);
|
||||
|
||||
#endif //ZIP_UNZIP_SEARCH_SEARCH_H
|
67
zip-unzip-search/main.cpp
Normal file
67
zip-unzip-search/main.cpp
Normal file
@@ -0,0 +1,67 @@
|
||||
#include "file_io.h"
|
||||
#include "logging.h"
|
||||
#include "cstring"
|
||||
#include "search.h"
|
||||
|
||||
/**
 * Print the command line usage summary to standard output.
 */
void PrintHelpMessage()
{
    static const char* const usageLines[] = {
        "Usage: \n",
        "Zip File: -z [In-File-Name] [Out-File-Name]\n",
        "Unzip File: -u [In-File-Name] [Out-File-Name]\n",
        "Search In Zip File: -s [Zip-File-Name] [Sample-String]\n",
        "Print Help Message: -h\n",
    };

    for (const char* line : usageLines)
    {
        printf("%s", line);
    }
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
if (argc == 4)
|
||||
{
|
||||
std::string inputFileName = std::string(argv[2]);
|
||||
std::string outputFileName = std::string(argv[3]);
|
||||
|
||||
if (strcmp(argv[1], "-z") == 0)
|
||||
{
|
||||
Logging::LoggingInfo("Start Zip File: " + inputFileName + " to zip file: " + outputFileName);
|
||||
|
||||
FileIO::WriteZipFile(inputFileName, outputFileName);
|
||||
|
||||
double zipRate = FileIO::CalculateZipRate(inputFileName, outputFileName) * 100.0;
|
||||
Logging::LoggingInfo("The Zip Rate is: " + std::to_string(zipRate) + "%");
|
||||
Logging::LoggingInfo("Zip Success!");
|
||||
}
|
||||
else if(strcmp(argv[1], "-u") == 0)
|
||||
{
|
||||
Logging::LoggingInfo("Start Unzip File: " + inputFileName + " to text file: " + outputFileName);
|
||||
|
||||
FileIO::WriteUnzipFile(inputFileName, outputFileName);
|
||||
|
||||
Logging::LoggingInfo("Unzip Success!");
|
||||
}
|
||||
else if(strcmp(argv[1], "-s") == 0)
|
||||
{
|
||||
Logging::LoggingInfo("Start to search in file " + inputFileName);
|
||||
SearchInFile(argv[2], argv[3]);
|
||||
Logging::LoggingInfo("Search finished");
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Unknown Usage!\n");
|
||||
PrintHelpMessage();
|
||||
}
|
||||
}
|
||||
else if (argc == 2 && strcmp(argv[1], "-h") == 0)
|
||||
{
|
||||
PrintHelpMessage();
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Unknown Usage!\n");
|
||||
PrintHelpMessage();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
307
zip-unzip-search/src/file_io.cpp
Normal file
307
zip-unzip-search/src/file_io.cpp
Normal file
@@ -0,0 +1,307 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
#include "file_io.h"
|
||||
#include "logging.h"
|
||||
#include "cstdio"
|
||||
#include "cstdlib"
|
||||
#include "const.h"
|
||||
#include "huffman.h"
|
||||
#include "unistd.h"
|
||||
#include "sys/stat.h"
|
||||
|
||||
int *FileIO::ReadCharFrequency(const std::string &fileName)
|
||||
{
|
||||
FILE *file = fopen(fileName.c_str(), "r");
|
||||
|
||||
if (file == nullptr)
|
||||
{
|
||||
// 文件打开失败
|
||||
Logging::LoggingInfo(fileName + "is not a valid filename");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
int* frequencyArray = new int[ASCII_LENGTH];
|
||||
|
||||
for (int i = 0; i < ASCII_LENGTH; i++)
|
||||
{
|
||||
// 将所有频率初始化为0
|
||||
frequencyArray[i] = 0;
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
int temp = fgetc(file);
|
||||
|
||||
if (temp == EOF)
|
||||
{
|
||||
// 文件结束
|
||||
break;
|
||||
}
|
||||
|
||||
if (temp >= ASCII_LENGTH || temp < 0)
|
||||
{
|
||||
// 读取到非法字符
|
||||
Logging::LoggingWarning(
|
||||
"Read illegal char " + std::to_string(temp) + " in file. Ignore it");
|
||||
}
|
||||
|
||||
frequencyArray[temp]++;
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
return frequencyArray;
|
||||
}
|
||||
|
||||
void FileIO::WriteZipFile(const std::string &inputFile, const std::string &outputFile)
|
||||
{
|
||||
int* frequencyArray = FileIO::ReadCharFrequency(inputFile);
|
||||
|
||||
auto huffmanCode = new HuffmanCode(frequencyArray);
|
||||
// 创建哈夫曼树
|
||||
huffmanCode->createHuffmanTree();
|
||||
auto dictionary = huffmanCode->getHuffmanCode();
|
||||
|
||||
FILE* input = fopen(inputFile.c_str(), "r");
|
||||
FILE* output = fopen(outputFile.c_str(), "wb");
|
||||
|
||||
// 判断文件打开ia是否成功
|
||||
if (input == nullptr)
|
||||
{
|
||||
Logging::LoggingError(inputFile + " is not an valid file name.");
|
||||
exit(0);
|
||||
}
|
||||
if (output == nullptr)
|
||||
{
|
||||
Logging::LoggingError(outputFile + " is not an valid file name.");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// 首先写入文件的元信息
|
||||
// 虽然目前元信息中部分信息还没有拿到
|
||||
// 但是先把文件中的空间占据了再说
|
||||
MetaData metaDataT{};
|
||||
fwrite(&metaDataT, sizeof(MetaData), 1, output);
|
||||
|
||||
// 写入哈夫曼数组
|
||||
fwrite(huffmanCode->nodes->data(), sizeof(HuffmanNode), huffmanCode->nodes->size(), output);
|
||||
|
||||
// 写入文件时的缓冲区
|
||||
int buffer = 0;
|
||||
int bufferPos = 0;
|
||||
|
||||
while (true)
|
||||
{
|
||||
int temp = fgetc(input);
|
||||
|
||||
// 读取到文件末尾
|
||||
if (temp == EOF)
|
||||
{
|
||||
|
||||
buffer = buffer << (32 - bufferPos);
|
||||
fwrite(&buffer, sizeof(int), 1, output);
|
||||
|
||||
metaDataT.LastBufferUsedLength = bufferPos;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (temp >= ASCII_LENGTH || temp < 0)
|
||||
{
|
||||
// 读取到非法字符
|
||||
Logging::LoggingWarning(
|
||||
"Read illegal char " + std::to_string(temp) + " in file. Ignore it");
|
||||
}
|
||||
|
||||
auto code = (*dictionary)[temp];
|
||||
|
||||
for (auto iter = code.begin(); iter < code.end(); iter++)
|
||||
{
|
||||
// 缓冲区已经满了
|
||||
if (bufferPos == 32)
|
||||
{
|
||||
fwrite(&buffer, sizeof(int), 1, output);
|
||||
bufferPos = 0;
|
||||
buffer = 0;
|
||||
}
|
||||
|
||||
buffer = (buffer << 1) + *iter;
|
||||
bufferPos++;
|
||||
}
|
||||
}
|
||||
|
||||
metaDataT.HuffmanRoot = huffmanCode->root;
|
||||
metaDataT.HuffmanNodeLength = (int )huffmanCode->nodes->size();
|
||||
// 写入元信息
|
||||
fseek(output, 0, SEEK_SET);
|
||||
fwrite(&metaDataT, sizeof(MetaData), 1, output);
|
||||
|
||||
delete frequencyArray;
|
||||
delete huffmanCode;
|
||||
delete dictionary;
|
||||
}
|
||||
|
||||
void FileIO::WriteUnzipFile(const std::string &inputFile, const std::string &outputFile)
|
||||
{
|
||||
FILE* input = fopen(inputFile.c_str(), "rb");
|
||||
FILE* output = fopen(outputFile.c_str(), "w");
|
||||
|
||||
// 检查文件是否正常打开
|
||||
if (input == nullptr)
|
||||
{
|
||||
Logging::LoggingError(inputFile + " is not a valid file name.");
|
||||
exit(0);
|
||||
}
|
||||
if (output == nullptr)
|
||||
{
|
||||
Logging::LoggingError(outputFile + " is not a valid file name.");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// 读取元信息
|
||||
MetaData metaData{};
|
||||
fread(&metaData, sizeof(MetaData), 1, input);
|
||||
|
||||
// 读取哈夫曼节点数组
|
||||
auto nodes = new HuffmanNode[metaData.HuffmanNodeLength];
|
||||
fread(nodes, sizeof(HuffmanNode), metaData.HuffmanNodeLength, input);
|
||||
|
||||
// 读取文件的缓冲区
|
||||
int buffer;
|
||||
fread(&buffer, sizeof(int), 1, input);
|
||||
int bufferPos;
|
||||
int nextBuffer;
|
||||
|
||||
HuffmanNode node = nodes[metaData.HuffmanRoot];
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (buffer == EOF)
|
||||
{
|
||||
// 读取结束
|
||||
break;
|
||||
}
|
||||
|
||||
// 这里为了处理最后一个缓冲区的问题
|
||||
// 设置了双缓冲
|
||||
size_t readResult = fread(&nextBuffer, sizeof(int), 1, input);
|
||||
|
||||
if (readResult != 1)
|
||||
{
|
||||
// 读取到文件末尾
|
||||
nextBuffer = EOF;
|
||||
bufferPos = metaData.LastBufferUsedLength;
|
||||
}
|
||||
else
|
||||
{
|
||||
bufferPos = 32;
|
||||
}
|
||||
|
||||
while (bufferPos > 0)
|
||||
{
|
||||
if (node.data == -1)
|
||||
{
|
||||
// 非叶子节点
|
||||
int value = (buffer >> 31) & 1;
|
||||
buffer = buffer << 1;
|
||||
bufferPos--;
|
||||
|
||||
if (value == 0)
|
||||
{
|
||||
node = nodes[node.lIndex];
|
||||
}
|
||||
else
|
||||
{
|
||||
node = nodes[node.rIndex];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// 叶子节点
|
||||
fputc(node.data, output);
|
||||
node = nodes[metaData.HuffmanRoot];
|
||||
}
|
||||
}
|
||||
|
||||
buffer = nextBuffer;
|
||||
}
|
||||
|
||||
delete[] nodes;
|
||||
fclose(input);
|
||||
fclose(output);
|
||||
}
|
||||
|
||||
double FileIO::CalculateZipRate(const std::string &inputFileName, const std::string &outputFileName)
|
||||
{
|
||||
struct stat originFileStat{};
|
||||
struct stat zipFileStat{};
|
||||
|
||||
stat(inputFileName.c_str(), &originFileStat);
|
||||
stat(outputFileName.c_str(), &zipFileStat);
|
||||
|
||||
auto originFileSize = (double )originFileStat.st_size;
|
||||
auto zipFileSize = (double )zipFileStat.st_size;
|
||||
|
||||
return zipFileSize / originFileSize;
|
||||
}
|
||||
|
||||
/**
 * Open a compressed file and move past its header, so that the first call to
 * read() returns the first payload bit.
 *
 * `position` is advanced by the number of header bits that were skipped. The
 * process exits when the file cannot be opened or its header is malformed.
 *
 * @param inputFileName name of the compressed file
 */
BinaryBuffer::BinaryBuffer(std::string &inputFileName)
{
    file = fopen(inputFileName.c_str(), "rb");

    if (file == nullptr)
    {
        // The file could not be opened.
        Logging::LoggingError(inputFileName + " is not a valid file name.");
        exit(0);
    }

    buffer = 0;
    bufferPos = 0;
    readFinishedFlag = false;

    // Read the metadata at the start of the file.
    MetaData metaData{};
    if (fread(&metaData, sizeof(MetaData), 1, file) != 1 || metaData.HuffmanNodeLength < 0)
    {
        Logging::LoggingError(inputFileName + " is not a valid zip file.");
        exit(0);
    }
    position = position + (int )sizeof(MetaData) * 8;

    // The Huffman node array is not needed here, so seek past it instead of
    // reading it into a stack array sized by a value taken from the file.
    const long nodeBytes = static_cast<long>(sizeof(HuffmanNode)) * metaData.HuffmanNodeLength;
    if (fseek(file, nodeBytes, SEEK_CUR) != 0)
    {
        Logging::LoggingError(inputFileName + " is not a valid zip file.");
        exit(0);
    }
    position = position + (int )sizeof(HuffmanNode) * metaData.HuffmanNodeLength * 8;
}
|
||||
|
||||
/**
 * Close the underlying file.
 */
BinaryBuffer::~BinaryBuffer()
{
    FILE* const handle = file;
    file = nullptr;
    fclose(handle);
}
|
||||
|
||||
char BinaryBuffer::read()
|
||||
{
|
||||
if (readFinishedFlag)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (bufferPos == 0)
|
||||
{
|
||||
// 当前缓冲区读取结束
|
||||
int result = (int )fread(&buffer, sizeof(int), 1, file);
|
||||
|
||||
if (result == 0)
|
||||
{
|
||||
readFinishedFlag = true;
|
||||
// 文件读取结束
|
||||
return -1;
|
||||
}
|
||||
|
||||
bufferPos = 32;
|
||||
}
|
||||
|
||||
int result = (buffer >> 31) & 1;
|
||||
buffer = buffer << 1;
|
||||
bufferPos--;
|
||||
position++;
|
||||
return (char )result;
|
||||
}
|
176
zip-unzip-search/src/huffman.cpp
Normal file
176
zip-unzip-search/src/huffman.cpp
Normal file
@@ -0,0 +1,176 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
#include "huffman.h"
|
||||
#include "const.h"
|
||||
#include "cstdio"
|
||||
|
||||
/**
 * Create one leaf node per ASCII character, weighted by its frequency.
 *
 * @param frequencyArray ASCII_LENGTH counters, indexed by character
 */
HuffmanCode::HuffmanCode(const int *frequencyArray)
{
    for (int symbol = 0; symbol != ASCII_LENGTH; ++symbol)
    {
        HuffmanNode leaf{};
        leaf.id = symbol;
        leaf.data = (char )symbol;
        leaf.frequency = frequencyArray[symbol];
        // A leaf has no children.
        leaf.lIndex = -1;
        leaf.rIndex = -1;

        nodes->push_back(leaf);
    }
}
|
||||
|
||||
/**
 * Create the object from an existing node array; the nodes are copied.
 *
 * @param nodeArray Huffman node array
 * @param length number of entries in nodeArray
 */
HuffmanCode::HuffmanCode(HuffmanNode *nodeArray, int length)
{
    // Fill the vector created by the in-class initializer of `nodes`.
    HuffmanNode* const arrayEnd = nodeArray + length;
    nodes->assign(nodeArray, arrayEnd);
}
|
||||
|
||||
/**
 * Release the node list.
 */
HuffmanCode::~HuffmanCode()
{
    std::vector<HuffmanNode>* const owned = nodes;
    nodes = nullptr;
    delete owned;
}
|
||||
|
||||
void HuffmanCode::sortForests(std::vector<HuffmanNode> &forests)
|
||||
{
|
||||
std::size_t length = forests.size();
|
||||
bool sorted = false;
|
||||
|
||||
for (std::size_t i = 1; i < length and !sorted; i++)
|
||||
{
|
||||
sorted = true;
|
||||
|
||||
for (std::size_t j = 0; j < length - i; j++)
|
||||
{
|
||||
if (forests[j].frequency > forests[j + 1].frequency)
|
||||
{
|
||||
HuffmanNode node = forests[j];
|
||||
forests[j] = forests[j + 1];
|
||||
forests[j + 1] = node;
|
||||
sorted = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void HuffmanCode::createHuffmanTree()
|
||||
{
|
||||
auto forests = new std::vector<HuffmanNode>(*nodes);
|
||||
// 节点数组里的编号
|
||||
int pos = (*nodes).rbegin()->id + 1;
|
||||
|
||||
while (forests->size() != 1)
|
||||
{
|
||||
// 反复执行建树的过程
|
||||
sortForests(*forests);
|
||||
|
||||
HuffmanNode node{};
|
||||
node.frequency = (*forests)[0].frequency + (*forests)[1].frequency;
|
||||
node.data = -1;
|
||||
// 权值大的节点为左子结点
|
||||
// 权值小的节点为右子结点
|
||||
node.rIndex = (*forests)[0].id;
|
||||
node.lIndex = (*forests)[1].id;
|
||||
node.id = pos;
|
||||
pos++;
|
||||
nodes->push_back(node);
|
||||
|
||||
// 在森里中删除已经合并的两棵树
|
||||
// 新建一颗树
|
||||
forests->erase(forests->begin(), forests->begin() + 2);
|
||||
forests->push_back(node);
|
||||
}
|
||||
|
||||
root = forests->begin()->id;
|
||||
delete forests;
|
||||
}
|
||||
|
||||
void HuffmanCode::printHuffmanTree()
|
||||
{
|
||||
if (root == -1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
printHuffmanTreeR(root);
|
||||
}
|
||||
|
||||
void HuffmanCode::printHuffmanTreeR(int nodeId)
|
||||
{
|
||||
HuffmanNode node = (*nodes)[nodeId];
|
||||
|
||||
// 不打印权值为0的节点
|
||||
if (node.lIndex != -1 and node.frequency != 0)
|
||||
{
|
||||
printf("%d %d\n", node.id, node.lIndex);
|
||||
printHuffmanTreeR(node.lIndex);
|
||||
}
|
||||
|
||||
if (node.rIndex != -1 and node.frequency != 0)
|
||||
{
|
||||
printf("%d %d\n", node.id, node.rIndex);
|
||||
printHuffmanTreeR(node.rIndex);
|
||||
}
|
||||
}
|
||||
|
||||
std::array<std::vector<char>, 128> * HuffmanCode::getHuffmanCode()
|
||||
{
|
||||
if (root == -1)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto dictionary = new std::array<std::vector<char>, ASCII_LENGTH>();
|
||||
|
||||
std::vector<char> code;
|
||||
|
||||
getHuffmanCodeR(*dictionary, root, code);
|
||||
|
||||
return dictionary;
|
||||
}
|
||||
|
||||
void HuffmanCode::getHuffmanCodeR(std::array<std::vector<char>, ASCII_LENGTH> &dictionary, int nodeId,
|
||||
std::vector<char> &code)
|
||||
{
|
||||
HuffmanNode node = (*nodes)[nodeId];
|
||||
|
||||
if (node.data != -1)
|
||||
{
|
||||
for (auto iterator = code.begin(); iterator < code.end(); iterator++)
|
||||
{
|
||||
dictionary[node.data].push_back(*iterator);
|
||||
}
|
||||
}
|
||||
|
||||
if (node.lIndex != -1)
|
||||
{
|
||||
// 遍历左子树
|
||||
code.push_back(0);
|
||||
getHuffmanCodeR(dictionary, node.lIndex, code);
|
||||
code.pop_back();
|
||||
}
|
||||
|
||||
if (node.rIndex != -1)
|
||||
{
|
||||
// 遍历右子树
|
||||
code.push_back(1);
|
||||
getHuffmanCodeR(dictionary, node.rIndex, code);
|
||||
code.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
void HuffmanCode::printHuffmanCode(const std::array<std::vector<char>, ASCII_LENGTH>& dictionary)
|
||||
{
|
||||
for (int i = 0; i < ASCII_LENGTH; i++)
|
||||
{
|
||||
auto code = dictionary[i];
|
||||
|
||||
printf("%d: ", i);
|
||||
for (auto iter = code.begin(); iter < code.end(); iter++)
|
||||
{
|
||||
putc(*iter + 48, stdout);
|
||||
}
|
||||
putc('\n', stdout);
|
||||
}
|
||||
}
|
20
zip-unzip-search/src/logging.cpp
Normal file
20
zip-unzip-search/src/logging.cpp
Normal file
@@ -0,0 +1,20 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-11.
|
||||
//
|
||||
#include "logging.h"
|
||||
#include "cstdio"
|
||||
|
||||
/**
 * Print an informational message on its own line on standard output.
 *
 * @param info message text
 */
void Logging::LoggingInfo(const std::string &info)
{
    const char* const text = info.c_str();
    printf("[Info] %s\n", text);
}
|
||||
|
||||
/**
 * Print a warning on its own line on standard output.
 *
 * @param warning message text
 */
void Logging::LoggingWarning(const std::string &warning)
{
    const char* const text = warning.c_str();
    printf("[warning] %s\n", text);
}
|
||||
|
||||
/**
 * Print an error on its own line on standard output.
 *
 * @param error message text
 */
void Logging::LoggingError(const std::string &error)
{
    const char* const text = error.c_str();
    printf("[error] %s\n", text);
}
|
194
zip-unzip-search/src/search.cpp
Normal file
194
zip-unzip-search/src/search.cpp
Normal file
@@ -0,0 +1,194 @@
|
||||
//
|
||||
// Created by ricardo on 22-12-16.
|
||||
//
|
||||
#include "search.h"
|
||||
#include "cstdio"
|
||||
#include "cstring"
|
||||
#include "logging.h"
|
||||
#include "file_io.h"
|
||||
#include "huffman.h"
|
||||
|
||||
/**
 * Copy the pattern and build both shift tables for it.
 *
 * @param sample pattern to look for, one bit (0 or 1) per element
 */
BMSearch::BMSearch(std::vector<char> &sample)
    : badCharArray(new int[2]),
      goodSuffixArray(new int[(int )sample.size()]),
      // Inside the initializer, `sample` names the constructor parameter.
      sample(new std::vector<char>(sample))
{
    generateBrokenCharArray(sample);
    generateGoodSuffixArray(sample);
}
|
||||
|
||||
|
||||
/**
 * Release the shift tables and the pattern copy.
 */
BMSearch::~BMSearch()
{
    // Both tables were allocated with new[], so they need delete[].
    delete[] badCharArray;
    delete[] goodSuffixArray;
    // The constructor made a private copy of the pattern.
    delete sample;
}
|
||||
|
||||
void BMSearch::generateBrokenCharArray(std::vector<char> &s)
|
||||
{
|
||||
int length = (int )s.size();
|
||||
|
||||
// 输入字符串为01串
|
||||
for (int i = 0; i < 2; i++)
|
||||
{
|
||||
badCharArray[i] = length;
|
||||
}
|
||||
|
||||
for (int i = 0; i < length - 1; i++)
|
||||
{
|
||||
badCharArray[s[i]] = length - i - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void BMSearch::generateGoodSuffixArray(std::vector<char> &s)
|
||||
{
|
||||
int length = (int )s.size();
|
||||
|
||||
int suffix[length];
|
||||
|
||||
suffix[length - 1] = length;
|
||||
|
||||
for (int i = length - 2; i >= 0; i--)
|
||||
{
|
||||
int pos = i;
|
||||
while (pos >= 0 and s[pos] == s[length - 1 - i + pos])
|
||||
{
|
||||
pos--;
|
||||
}
|
||||
suffix[i] = i - pos;
|
||||
}
|
||||
|
||||
for (int i = 0; i < length; i++)
|
||||
{
|
||||
goodSuffixArray[i] = length;
|
||||
}
|
||||
|
||||
int j = 0;
|
||||
for (int i = length - 1; i >= 0 ; i--)
|
||||
{
|
||||
if (suffix[i] == i + 1)
|
||||
{
|
||||
for (; j < length - 1 - i; j++)
|
||||
{
|
||||
if (goodSuffixArray[j] == length)
|
||||
{
|
||||
goodSuffixArray[j] = length - 1 - i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < length - 1; i++)
|
||||
{
|
||||
goodSuffixArray[length - 1 - suffix[i]] = length - 1 - i;
|
||||
}
|
||||
}
|
||||
|
||||
void BMSearch::matchFile(std::string &fileName)
|
||||
{
|
||||
auto buffer = new BinaryBuffer(fileName);
|
||||
|
||||
std::vector<char> inputArray;
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (inputArray.size() != sample->size())
|
||||
{
|
||||
// bm算法要求后缀匹配
|
||||
// 所以开始之间需要读取一个长度和模式字符串长度相同的缓冲区
|
||||
char temp = buffer->read();
|
||||
if (temp == -1)
|
||||
{
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
inputArray.push_back(temp);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// 正式开始匹配
|
||||
int pos = (int )sample->size() - 1;
|
||||
for(; pos >= 0 and (*sample)[pos] == inputArray[pos]; pos--);
|
||||
if (pos < 0)
|
||||
{
|
||||
// 完成一次匹配
|
||||
Logging::LoggingInfo("Found at " + std::to_string(buffer->position));
|
||||
auto begin = inputArray.begin();
|
||||
auto end = begin + goodSuffixArray[0];
|
||||
inputArray.erase(begin, end);
|
||||
}
|
||||
else
|
||||
{
|
||||
// 匹配失败
|
||||
auto begin = inputArray.begin();
|
||||
int teleport = max(goodSuffixArray[pos],
|
||||
badCharArray[inputArray[pos]] - (int )sample->size() + 1 + pos);
|
||||
auto end = begin + teleport;
|
||||
inputArray.erase(begin, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
delete buffer;
|
||||
}
|
||||
|
||||
/**
 * Return the larger of two integers.
 */
int BMSearch::max(int a, int b)
{
    if (a < b)
    {
        return b;
    }

    return a;
}
|
||||
|
||||
void SearchInFile(char* fileName, char* sample)
|
||||
{
|
||||
FILE* file = fopen(fileName, "rb");
|
||||
|
||||
if (file == nullptr)
|
||||
{
|
||||
Logging::LoggingError(std::string(fileName) + " is not a valid file name.");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// 读取元信息
|
||||
MetaData metaData{};
|
||||
fread(&metaData, sizeof(MetaData), 1, file);
|
||||
|
||||
// 读取哈夫曼节点数组
|
||||
auto nodes = new HuffmanNode[metaData.HuffmanNodeLength];
|
||||
fread(nodes, sizeof(HuffmanNode), metaData.HuffmanNodeLength, file);
|
||||
fclose(file);
|
||||
// 从哈夫曼节点数组创建哈夫曼编码
|
||||
auto huffmanCode = new HuffmanCode(nodes, metaData.HuffmanNodeLength);
|
||||
huffmanCode->root = metaData.HuffmanRoot;
|
||||
auto dictionary = huffmanCode->getHuffmanCode();
|
||||
|
||||
// 获得模板字符串的哈夫曼编码
|
||||
std::vector<char> sampleCode;
|
||||
int sampleLength = (int )strlen(sample);
|
||||
|
||||
Logging::LoggingInfo("The binary representation of " + std::string(sample) + " is ");
|
||||
for (int i = 0; i < sampleLength; i++)
|
||||
{
|
||||
auto code = (*dictionary)[sample[i]];
|
||||
for (auto iter = code.begin(); iter < code.end(); iter++)
|
||||
{
|
||||
sampleCode.push_back(*iter);
|
||||
putc(*iter + 48, stdout);
|
||||
}
|
||||
}
|
||||
putc('\n', stdout);
|
||||
|
||||
// 开始查找
|
||||
auto bm = new BMSearch(sampleCode);
|
||||
|
||||
std::string str = std::string(fileName);
|
||||
bm->matchFile(str);
|
||||
|
||||
delete bm;
|
||||
delete[] nodes;
|
||||
delete huffmanCode;
|
||||
delete dictionary;
|
||||
}
|
Reference in New Issue
Block a user