1 | //===--- SystemIncludeExtractor.cpp ------------------------------*- C++-*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
// Some compiler drivers have an implicit search mechanism for system headers.
9 | // This compilation database implementation tries to extract that information by |
10 | // executing the driver in verbose mode. gcc-compatible drivers print something |
11 | // like: |
12 | // .... |
13 | // .... |
14 | // #include <...> search starts here: |
15 | // /usr/lib/gcc/x86_64-linux-gnu/7/include |
16 | // /usr/local/include |
17 | // /usr/lib/gcc/x86_64-linux-gnu/7/include-fixed |
18 | // /usr/include/x86_64-linux-gnu |
19 | // /usr/include |
20 | // End of search list. |
21 | // .... |
22 | // .... |
23 | // This component parses that output and adds each path to command line args |
24 | // provided by Base, after prepending them with -isystem. Therefore current |
25 | // implementation would not work with a driver that is not gcc-compatible. |
26 | // |
27 | // First argument of the command line received from underlying compilation |
28 | // database is used as compiler driver path. Due to this arbitrary binary |
29 | // execution, this mechanism is not used by default and only executes binaries |
30 | // in the paths that are explicitly included by the user. |
31 | |
32 | #include "CompileCommands.h" |
33 | #include "GlobalCompilationDatabase.h" |
34 | #include "support/Logger.h" |
35 | #include "support/Threading.h" |
36 | #include "support/Trace.h" |
37 | #include "clang/Basic/Diagnostic.h" |
38 | #include "clang/Basic/DiagnosticIDs.h" |
39 | #include "clang/Basic/DiagnosticOptions.h" |
40 | #include "clang/Basic/TargetInfo.h" |
41 | #include "clang/Basic/TargetOptions.h" |
42 | #include "clang/Driver/Types.h" |
43 | #include "clang/Tooling/CompilationDatabase.h" |
44 | #include "llvm/ADT/ArrayRef.h" |
45 | #include "llvm/ADT/DenseMap.h" |
46 | #include "llvm/ADT/Hashing.h" |
47 | #include "llvm/ADT/IntrusiveRefCntPtr.h" |
48 | #include "llvm/ADT/STLExtras.h" |
49 | #include "llvm/ADT/ScopeExit.h" |
50 | #include "llvm/ADT/SmallString.h" |
51 | #include "llvm/ADT/SmallVector.h" |
52 | #include "llvm/ADT/StringExtras.h" |
53 | #include "llvm/ADT/StringRef.h" |
54 | #include "llvm/Support/ErrorHandling.h" |
55 | #include "llvm/Support/FileSystem.h" |
56 | #include "llvm/Support/MemoryBuffer.h" |
57 | #include "llvm/Support/Path.h" |
58 | #include "llvm/Support/Program.h" |
59 | #include "llvm/Support/Regex.h" |
60 | #include "llvm/Support/ScopedPrinter.h" |
61 | #include "llvm/Support/raw_ostream.h" |
62 | #include <cassert> |
63 | #include <cstddef> |
64 | #include <iterator> |
65 | #include <memory> |
66 | #include <optional> |
67 | #include <string> |
68 | #include <tuple> |
69 | #include <utility> |
70 | #include <vector> |
71 | |
72 | namespace clang::clangd { |
73 | namespace { |
74 | |
/// Information obtained by querying a compiler driver.
struct DriverInfo {
  /// Directories the driver listed in its `#include <...>` search list, in the
  /// order they were printed.
  std::vector<std::string> SystemIncludes;
  /// Target triple reported by the driver. Stays empty when no target line was
  /// seen or the reported triple was rejected.
  std::string Target;
};
79 | |
80 | struct DriverArgs { |
81 | // Name of the driver program to execute or absolute path to it. |
82 | std::string Driver; |
83 | // Whether certain includes should be part of query. |
84 | bool StandardIncludes = true; |
85 | bool StandardCXXIncludes = true; |
86 | bool BuiltinIncludes = true; |
87 | // Language to use while querying. |
88 | std::string Lang; |
89 | std::string Sysroot; |
90 | std::string ISysroot; |
91 | |
92 | bool operator==(const DriverArgs &RHS) const { |
93 | return std::tie(Driver, StandardIncludes, StandardCXXIncludes, |
94 | BuiltinIncludes, Lang, Sysroot, ISysroot) == |
95 | std::tie(RHS.Driver, RHS.StandardIncludes, RHS.StandardCXXIncludes, |
96 | RHS.BuiltinIncludes, RHS.Lang, RHS.Sysroot, ISysroot); |
97 | } |
98 | |
99 | DriverArgs(const tooling::CompileCommand &Cmd, llvm::StringRef File) { |
100 | llvm::SmallString<128> Driver(Cmd.CommandLine.front()); |
101 | // Driver is a not a single executable name but instead a path (either |
102 | // relative or absolute). |
103 | if (llvm::any_of(Driver, |
104 | [](char C) { return llvm::sys::path::is_separator(C); })) { |
105 | llvm::sys::fs::make_absolute(Cmd.Directory, Driver); |
106 | } |
107 | this->Driver = Driver.str().str(); |
108 | for (size_t I = 0, E = Cmd.CommandLine.size(); I < E; ++I) { |
109 | llvm::StringRef Arg = Cmd.CommandLine[I]; |
110 | |
111 | // Look for Language related flags. |
112 | if (Arg.consume_front("-x" )) { |
113 | if (Arg.empty() && I + 1 < E) |
114 | Lang = Cmd.CommandLine[I + 1]; |
115 | else |
116 | Lang = Arg.str(); |
117 | } |
118 | // Look for standard/builtin includes. |
119 | else if (Arg == "-nostdinc" || Arg == "--no-standard-includes" ) |
120 | StandardIncludes = false; |
121 | else if (Arg == "-nostdinc++" ) |
122 | StandardCXXIncludes = false; |
123 | else if (Arg == "-nobuiltininc" ) |
124 | BuiltinIncludes = false; |
125 | // Figure out sysroot |
126 | else if (Arg.consume_front("--sysroot" )) { |
127 | if (Arg.consume_front("=" )) |
128 | Sysroot = Arg.str(); |
129 | else if (Arg.empty() && I + 1 < E) |
130 | Sysroot = Cmd.CommandLine[I + 1]; |
131 | } else if (Arg.consume_front("-isysroot" )) { |
132 | if (Arg.empty() && I + 1 < E) |
133 | ISysroot = Cmd.CommandLine[I + 1]; |
134 | else |
135 | ISysroot = Arg.str(); |
136 | } |
137 | } |
138 | |
139 | // Downgrade objective-c++-header (used in clangd's fallback flags for .h |
140 | // files) to c++-header, as some drivers may fail to run the extraction |
141 | // command if it contains `-xobjective-c++-header` and objective-c++ support |
142 | // is not installed. |
143 | // In practice, we don't see different include paths for the two on |
144 | // clang+mac, which is the most common objectve-c compiler. |
145 | if (Lang == "objective-c++-header" ) { |
146 | Lang = "c++-header" ; |
147 | } |
148 | |
149 | // If language is not explicit in the flags, infer from the file. |
150 | // This is important as we want to cache each language separately. |
151 | if (Lang.empty()) { |
152 | llvm::StringRef Ext = llvm::sys::path::extension(File).trim('.'); |
153 | auto Type = driver::types::lookupTypeForExtension(Ext); |
154 | if (Type == driver::types::TY_INVALID) { |
155 | elog("System include extraction: invalid file type for {0}" , Ext); |
156 | } else { |
157 | Lang = driver::types::getTypeName(Type); |
158 | } |
159 | } |
160 | } |
161 | llvm::SmallVector<llvm::StringRef> render() const { |
162 | // FIXME: Don't treat lang specially? |
163 | assert(!Lang.empty()); |
164 | llvm::SmallVector<llvm::StringRef> Args = {"-x" , Lang}; |
165 | if (!StandardIncludes) |
166 | Args.push_back("-nostdinc" ); |
167 | if (!StandardCXXIncludes) |
168 | Args.push_back("-nostdinc++" ); |
169 | if (!BuiltinIncludes) |
170 | Args.push_back("-nobuiltininc++" ); |
171 | if (!Sysroot.empty()) |
172 | Args.append({"--sysroot" , Sysroot}); |
173 | if (!ISysroot.empty()) |
174 | Args.append({"-isysroot" , ISysroot}); |
175 | return Args; |
176 | } |
177 | |
178 | static DriverArgs getEmpty() { return {}; } |
179 | |
180 | private: |
181 | DriverArgs() = default; |
182 | }; |
183 | } // namespace |
184 | } // namespace clang::clangd |
185 | namespace llvm { |
186 | using DriverArgs = clang::clangd::DriverArgs; |
187 | template <> struct DenseMapInfo<DriverArgs> { |
188 | static DriverArgs getEmptyKey() { |
189 | auto Driver = DriverArgs::getEmpty(); |
190 | Driver.Driver = "EMPTY_KEY" ; |
191 | return Driver; |
192 | } |
193 | static DriverArgs getTombstoneKey() { |
194 | auto Driver = DriverArgs::getEmpty(); |
195 | Driver.Driver = "TOMBSTONE_KEY" ; |
196 | return Driver; |
197 | } |
198 | static unsigned getHashValue(const DriverArgs &Val) { |
199 | return llvm::hash_value(std::tuple{ |
200 | Val.Driver, |
201 | Val.StandardIncludes, |
202 | Val.StandardCXXIncludes, |
203 | Val.BuiltinIncludes, |
204 | Val.Lang, |
205 | Val.Sysroot, |
206 | Val.ISysroot, |
207 | }); |
208 | } |
209 | static bool isEqual(const DriverArgs &LHS, const DriverArgs &RHS) { |
210 | return LHS == RHS; |
211 | } |
212 | }; |
213 | } // namespace llvm |
214 | namespace clang::clangd { |
215 | namespace { |
216 | bool isValidTarget(llvm::StringRef Triple) { |
217 | std::shared_ptr<TargetOptions> TargetOpts(new TargetOptions); |
218 | TargetOpts->Triple = Triple.str(); |
219 | DiagnosticsEngine Diags(new DiagnosticIDs, new DiagnosticOptions, |
220 | new IgnoringDiagConsumer); |
221 | llvm::IntrusiveRefCntPtr<TargetInfo> Target = |
222 | TargetInfo::CreateTargetInfo(Diags, TargetOpts); |
223 | return bool(Target); |
224 | } |
225 | |
226 | std::optional<DriverInfo> parseDriverOutput(llvm::StringRef Output) { |
227 | DriverInfo Info; |
228 | const char SIS[] = "#include <...> search starts here:" ; |
229 | const char SIE[] = "End of search list." ; |
230 | const char TS[] = "Target: " ; |
231 | llvm::SmallVector<llvm::StringRef> Lines; |
232 | Output.split(Lines, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false); |
233 | |
234 | enum { |
235 | Initial, // Initial state: searching for target or includes list. |
236 | , // Includes extracting. |
237 | Done // Includes and target extraction done. |
238 | } State = Initial; |
239 | bool SeenIncludes = false; |
240 | bool SeenTarget = false; |
241 | for (auto *It = Lines.begin(); State != Done && It != Lines.end(); ++It) { |
242 | auto Line = *It; |
243 | switch (State) { |
244 | case Initial: |
245 | if (!SeenIncludes && Line.trim() == SIS) { |
246 | SeenIncludes = true; |
247 | State = IncludesExtracting; |
248 | } else if (!SeenTarget && Line.trim().startswith(TS)) { |
249 | SeenTarget = true; |
250 | llvm::StringRef TargetLine = Line.trim(); |
251 | TargetLine.consume_front(TS); |
252 | // Only detect targets that clang understands |
253 | if (!isValidTarget(TargetLine)) { |
254 | elog("System include extraction: invalid target \"{0}\", ignoring" , |
255 | TargetLine); |
256 | } else { |
257 | Info.Target = TargetLine.str(); |
258 | vlog("System include extraction: target extracted: \"{0}\"" , |
259 | TargetLine); |
260 | } |
261 | } |
262 | break; |
263 | case IncludesExtracting: |
264 | if (Line.trim() == SIE) { |
265 | State = SeenTarget ? Done : Initial; |
266 | } else { |
267 | Info.SystemIncludes.push_back(Line.trim().str()); |
268 | vlog("System include extraction: adding {0}" , Line); |
269 | } |
270 | break; |
271 | default: |
272 | llvm_unreachable("Impossible state of the driver output parser" ); |
273 | break; |
274 | } |
275 | } |
276 | if (!SeenIncludes) { |
277 | elog("System include extraction: start marker not found: {0}" , Output); |
278 | return std::nullopt; |
279 | } |
280 | if (State == IncludesExtracting) { |
281 | elog("System include extraction: end marker missing: {0}" , Output); |
282 | return std::nullopt; |
283 | } |
284 | return std::move(Info); |
285 | } |
286 | |
287 | std::optional<DriverInfo> |
288 | extractSystemIncludesAndTarget(const DriverArgs &InputArgs, |
289 | const llvm::Regex &QueryDriverRegex) { |
290 | trace::Span Tracer("Extract system includes and target" ); |
291 | |
292 | std::string Driver = InputArgs.Driver; |
293 | if (!llvm::sys::path::is_absolute(Driver)) { |
294 | auto DriverProgram = llvm::sys::findProgramByName(Driver); |
295 | if (DriverProgram) { |
296 | vlog("System include extraction: driver {0} expanded to {1}" , Driver, |
297 | *DriverProgram); |
298 | Driver = *DriverProgram; |
299 | } else { |
300 | elog("System include extraction: driver {0} not found in PATH" , Driver); |
301 | return std::nullopt; |
302 | } |
303 | } |
304 | |
305 | SPAN_ATTACH(Tracer, "driver" , Driver); |
306 | SPAN_ATTACH(Tracer, "lang" , InputArgs.Lang); |
307 | |
308 | if (!QueryDriverRegex.match(Driver)) { |
309 | vlog("System include extraction: not allowed driver {0}" , Driver); |
310 | return std::nullopt; |
311 | } |
312 | |
313 | llvm::SmallString<128> StdErrPath; |
314 | if (auto EC = llvm::sys::fs::createTemporaryFile("system-includes" , "clangd" , |
315 | StdErrPath)) { |
316 | elog("System include extraction: failed to create temporary file with " |
317 | "error {0}" , |
318 | EC.message()); |
319 | return std::nullopt; |
320 | } |
321 | auto CleanUp = llvm::make_scope_exit( |
322 | [&StdErrPath]() { llvm::sys::fs::remove(StdErrPath); }); |
323 | |
324 | std::optional<llvm::StringRef> Redirects[] = {{"" }, {"" }, StdErrPath.str()}; |
325 | |
326 | llvm::SmallVector<llvm::StringRef> Args = {Driver, "-E" , "-v" }; |
327 | Args.append(InputArgs.render()); |
328 | // Input needs to go after Lang flags. |
329 | Args.push_back("-" ); |
330 | |
331 | std::string ErrMsg; |
332 | if (int RC = llvm::sys::ExecuteAndWait(Driver, Args, /*Env=*/std::nullopt, |
333 | Redirects, /*SecondsToWait=*/0, |
334 | /*MemoryLimit=*/0, &ErrMsg)) { |
335 | elog("System include extraction: driver execution failed with return code: " |
336 | "{0} - '{1}'. Args: [{2}]" , |
337 | llvm::to_string(RC), ErrMsg, printArgv(Args)); |
338 | return std::nullopt; |
339 | } |
340 | |
341 | auto BufOrError = llvm::MemoryBuffer::getFile(StdErrPath); |
342 | if (!BufOrError) { |
343 | elog("System include extraction: failed to read {0} with error {1}" , |
344 | StdErrPath, BufOrError.getError().message()); |
345 | return std::nullopt; |
346 | } |
347 | |
348 | std::optional<DriverInfo> Info = |
349 | parseDriverOutput(BufOrError->get()->getBuffer()); |
350 | if (!Info) |
351 | return std::nullopt; |
352 | log("System includes extractor: successfully executed {0}\n\tgot includes: " |
353 | "\"{1}\"\n\tgot target: \"{2}\"" , |
354 | Driver, llvm::join(Info->SystemIncludes, ", " ), Info->Target); |
355 | return Info; |
356 | } |
357 | |
358 | tooling::CompileCommand & |
359 | addSystemIncludes(tooling::CompileCommand &Cmd, |
360 | llvm::ArrayRef<std::string> SystemIncludes) { |
361 | std::vector<std::string> ToAppend; |
362 | for (llvm::StringRef Include : SystemIncludes) { |
363 | // FIXME(kadircet): This doesn't work when we have "--driver-mode=cl" |
364 | ToAppend.push_back("-isystem" ); |
365 | ToAppend.push_back(Include.str()); |
366 | } |
367 | if (!ToAppend.empty()) { |
368 | // Just append when `--` isn't present. |
369 | auto InsertAt = llvm::find(Cmd.CommandLine, "--" ); |
370 | Cmd.CommandLine.insert(InsertAt, std::make_move_iterator(ToAppend.begin()), |
371 | std::make_move_iterator(ToAppend.end())); |
372 | } |
373 | return Cmd; |
374 | } |
375 | |
376 | tooling::CompileCommand &setTarget(tooling::CompileCommand &Cmd, |
377 | const std::string &Target) { |
378 | if (!Target.empty()) { |
379 | // We do not want to override existing target with extracted one. |
380 | for (llvm::StringRef Arg : Cmd.CommandLine) { |
381 | if (Arg == "-target" || Arg.startswith("--target=" )) |
382 | return Cmd; |
383 | } |
384 | // Just append when `--` isn't present. |
385 | auto InsertAt = llvm::find(Cmd.CommandLine, "--" ); |
386 | Cmd.CommandLine.insert(InsertAt, "--target=" + Target); |
387 | } |
388 | return Cmd; |
389 | } |
390 | |
391 | /// Converts a glob containing only ** or * into a regex. |
392 | std::string convertGlobToRegex(llvm::StringRef Glob) { |
393 | std::string RegText; |
394 | llvm::raw_string_ostream RegStream(RegText); |
395 | RegStream << '^'; |
396 | for (size_t I = 0, E = Glob.size(); I < E; ++I) { |
397 | if (Glob[I] == '*') { |
398 | if (I + 1 < E && Glob[I + 1] == '*') { |
399 | // Double star, accept any sequence. |
400 | RegStream << ".*" ; |
401 | // Also skip the second star. |
402 | ++I; |
403 | } else { |
404 | // Single star, accept any sequence without a slash. |
405 | RegStream << "[^/]*" ; |
406 | } |
407 | } else if (llvm::sys::path::is_separator(Glob[I]) && |
408 | llvm::sys::path::is_separator('/') && |
409 | llvm::sys::path::is_separator('\\')) { |
410 | RegStream << R"([/\\])" ; // Accept either slash on windows. |
411 | } else { |
412 | RegStream << llvm::Regex::escape(Glob.substr(I, 1)); |
413 | } |
414 | } |
415 | RegStream << '$'; |
416 | RegStream.flush(); |
417 | return RegText; |
418 | } |
419 | |
420 | /// Converts a glob containing only ** or * into a regex. |
421 | llvm::Regex convertGlobsToRegex(llvm::ArrayRef<std::string> Globs) { |
422 | assert(!Globs.empty() && "Globs cannot be empty!" ); |
423 | std::vector<std::string> RegTexts; |
424 | RegTexts.reserve(Globs.size()); |
425 | for (llvm::StringRef Glob : Globs) |
426 | RegTexts.push_back(convertGlobToRegex(Glob)); |
427 | |
428 | // Tempting to pass IgnoreCase, but we don't know the FS sensitivity. |
429 | llvm::Regex Reg(llvm::join(RegTexts, "|" )); |
430 | assert(Reg.isValid(RegTexts.front()) && |
431 | "Created an invalid regex from globs" ); |
432 | return Reg; |
433 | } |
434 | |
435 | /// Extracts system includes from a trusted driver by parsing the output of |
436 | /// include search path and appends them to the commands coming from underlying |
437 | /// compilation database. |
438 | class { |
439 | public: |
440 | (llvm::ArrayRef<std::string> QueryDriverGlobs) |
441 | : QueryDriverRegex(convertGlobsToRegex(QueryDriverGlobs)) {} |
442 | |
443 | void operator()(tooling::CompileCommand &Cmd, llvm::StringRef File) const { |
444 | if (Cmd.CommandLine.empty()) |
445 | return; |
446 | |
447 | DriverArgs Args(Cmd, File); |
448 | if (Args.Lang.empty()) |
449 | return; |
450 | if (auto Info = QueriedDrivers.get(Args, [&] { |
451 | return extractSystemIncludesAndTarget(Args, QueryDriverRegex); |
452 | })) { |
453 | setTarget(addSystemIncludes(Cmd, Info->SystemIncludes), Info->Target); |
454 | } |
455 | } |
456 | |
457 | private: |
458 | // Caches includes extracted from a driver. Key is driver:lang. |
459 | Memoize<llvm::DenseMap<DriverArgs, std::optional<DriverInfo>>> ; |
460 | llvm::Regex ; |
461 | }; |
462 | } // namespace |
463 | |
464 | SystemIncludeExtractorFn |
465 | (llvm::ArrayRef<std::string> QueryDriverGlobs) { |
466 | if (QueryDriverGlobs.empty()) |
467 | return nullptr; |
468 | return SystemIncludeExtractor(QueryDriverGlobs); |
469 | } |
470 | |
471 | } // namespace clang::clangd |
472 | |