1 | #include <iostream> |
2 | #include <optional> |
3 | #include <boost/program_options.hpp> |
4 | #include <boost/algorithm/string/join.hpp> |
5 | |
6 | #include <Common/Exception.h> |
7 | #include <IO/WriteBufferFromFileDescriptor.h> |
8 | #include <IO/ReadBufferFromFileDescriptor.h> |
9 | #include <Compression/CompressedWriteBuffer.h> |
10 | #include <Compression/CompressedReadBuffer.h> |
11 | #include <IO/WriteHelpers.h> |
12 | #include <IO/copyData.h> |
13 | #include <Parsers/parseQuery.h> |
14 | #include <Parsers/ExpressionElementParsers.h> |
15 | #include <Compression/CompressionFactory.h> |
16 | #include <Common/TerminalSize.h> |
17 | |
18 | |
/// Error codes used by this tool. They are only declared here (extern),
/// so their definitions have to come from another translation unit.
namespace DB::ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int TOO_LARGE_SIZE_COMPRESSED;
}
27 | |
28 | |
29 | namespace |
30 | { |
31 | |
32 | /// Outputs sizes of uncompressed and compressed blocks for compressed file. |
33 | void checkAndWriteHeader(DB::ReadBuffer & in, DB::WriteBuffer & out) |
34 | { |
35 | while (!in.eof()) |
36 | { |
37 | in.ignore(16); /// checksum |
38 | |
39 | char [COMPRESSED_BLOCK_HEADER_SIZE]; |
40 | in.readStrict(header, COMPRESSED_BLOCK_HEADER_SIZE); |
41 | |
42 | UInt32 size_compressed = unalignedLoad<UInt32>(&header[1]); |
43 | |
44 | if (size_compressed > DBMS_MAX_COMPRESSED_SIZE) |
45 | throw DB::Exception("Too large size_compressed. Most likely corrupted data." , DB::ErrorCodes::TOO_LARGE_SIZE_COMPRESSED); |
46 | |
47 | UInt32 size_decompressed = unalignedLoad<UInt32>(&header[5]); |
48 | |
49 | DB::writeText(size_decompressed, out); |
50 | DB::writeChar('\t', out); |
51 | DB::writeText(size_compressed, out); |
52 | DB::writeChar('\n', out); |
53 | |
54 | in.ignore(size_compressed - COMPRESSED_BLOCK_HEADER_SIZE); |
55 | } |
56 | } |
57 | |
58 | } |
59 | |
60 | #pragma GCC diagnostic ignored "-Wunused-function" |
61 | #pragma GCC diagnostic ignored "-Wmissing-declarations" |
62 | |
63 | int mainEntryClickHouseCompressor(int argc, char ** argv) |
64 | { |
65 | boost::program_options::options_description desc = createOptionsDescription("Allowed options" , getTerminalWidth()); |
66 | desc.add_options() |
67 | ("help,h" , "produce help message" ) |
68 | ("decompress,d" , "decompress" ) |
69 | ("block-size,b" , boost::program_options::value<unsigned>()->default_value(DBMS_DEFAULT_BUFFER_SIZE), "compress in blocks of specified size" ) |
70 | ("hc" , "use LZ4HC instead of LZ4" ) |
71 | ("zstd" , "use ZSTD instead of LZ4" ) |
72 | ("codec" , boost::program_options::value<std::vector<std::string>>()->multitoken(), "use codecs combination instead of LZ4" ) |
73 | ("level" , boost::program_options::value<int>(), "compression level for codecs spicified via flags" ) |
74 | ("none" , "use no compression instead of LZ4" ) |
75 | ("stat" , "print block statistics of compressed data" ) |
76 | ; |
77 | |
78 | boost::program_options::variables_map options; |
79 | boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); |
80 | |
81 | if (options.count("help" )) |
82 | { |
83 | std::cout << "Usage: " << argv[0] << " [options] < in > out" << std::endl; |
84 | std::cout << desc << std::endl; |
85 | return 1; |
86 | } |
87 | |
88 | try |
89 | { |
90 | bool decompress = options.count("decompress" ); |
91 | bool use_lz4hc = options.count("hc" ); |
92 | bool use_zstd = options.count("zstd" ); |
93 | bool stat_mode = options.count("stat" ); |
94 | bool use_none = options.count("none" ); |
95 | unsigned block_size = options["block-size" ].as<unsigned>(); |
96 | std::vector<std::string> codecs; |
97 | if (options.count("codec" )) |
98 | codecs = options["codec" ].as<std::vector<std::string>>(); |
99 | |
100 | if ((use_lz4hc || use_zstd || use_none) && !codecs.empty()) |
101 | throw DB::Exception("Wrong options, codec flags like --zstd and --codec options are mutually exclusive" , DB::ErrorCodes::BAD_ARGUMENTS); |
102 | |
103 | if (!codecs.empty() && options.count("level" )) |
104 | throw DB::Exception("Wrong options, --level is not compatible with --codec list" , DB::ErrorCodes::BAD_ARGUMENTS); |
105 | |
106 | std::string method_family = "LZ4" ; |
107 | |
108 | if (use_lz4hc) |
109 | method_family = "LZ4HC" ; |
110 | else if (use_zstd) |
111 | method_family = "ZSTD" ; |
112 | else if (use_none) |
113 | method_family = "NONE" ; |
114 | |
115 | std::optional<int> level = std::nullopt; |
116 | if (options.count("level" )) |
117 | level = options["level" ].as<int>(); |
118 | |
119 | |
120 | DB::CompressionCodecPtr codec; |
121 | if (!codecs.empty()) |
122 | { |
123 | DB::ParserCodec codec_parser; |
124 | |
125 | std::string codecs_line = boost::algorithm::join(codecs, "," ); |
126 | auto ast = DB::parseQuery(codec_parser, "(" + codecs_line + ")" , 0); |
127 | codec = DB::CompressionCodecFactory::instance().get(ast, nullptr); |
128 | } |
129 | else |
130 | codec = DB::CompressionCodecFactory::instance().get(method_family, level); |
131 | |
132 | |
133 | DB::ReadBufferFromFileDescriptor rb(STDIN_FILENO); |
134 | DB::WriteBufferFromFileDescriptor wb(STDOUT_FILENO); |
135 | |
136 | if (stat_mode) |
137 | { |
138 | /// Output statistic for compressed file. |
139 | checkAndWriteHeader(rb, wb); |
140 | } |
141 | else if (decompress) |
142 | { |
143 | /// Decompression |
144 | DB::CompressedReadBuffer from(rb); |
145 | DB::copyData(from, wb); |
146 | } |
147 | else |
148 | { |
149 | /// Compression |
150 | DB::CompressedWriteBuffer to(wb, codec, block_size); |
151 | DB::copyData(rb, to); |
152 | } |
153 | } |
154 | catch (...) |
155 | { |
156 | std::cerr << DB::getCurrentExceptionMessage(true); |
157 | return DB::getCurrentExceptionCode(); |
158 | } |
159 | |
160 | return 0; |
161 | } |
162 | |