demangle-rust.c 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <string.h>
  3. #include "util.h"
  4. #include "debug.h"
  5. #include "demangle-rust.h"
  6. /*
  7. * Mangled Rust symbols look like this:
  8. *
  9. * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
  10. *
  11. * The original symbol is:
  12. *
  13. * <std::sys::fd::FileDesc as core::ops::Drop>::drop
  14. *
  15. * The last component of the path is a 64-bit hash in lowercase hex, prefixed
  16. * with "h". Rust does not have a global namespace between crates, an illusion
  17. * which Rust maintains by using the hash to distinguish things that would
  18. * otherwise have the same symbol.
  19. *
  20. * Any path component not starting with a XID_Start character is prefixed with
  21. * "_".
  22. *
  23. * The following escape sequences are used:
  24. *
  25. * "," => $C$
  26. * "@" => $SP$
  27. * "*" => $BP$
  28. * "&" => $RF$
  29. * "<" => $LT$
  30. * ">" => $GT$
  31. * "(" => $LP$
  32. * ")" => $RP$
  33. * " " => $u20$
  34. * "'" => $u27$
  35. * "[" => $u5b$
  36. * "]" => $u5d$
  37. * "~" => $u7e$
  38. *
  39. * A double ".." means "::" and a single "." means "-".
  40. *
  41. * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
  42. */
  /* Forward declarations of the file-local helpers defined further down. */
  static bool looks_like_rust(const char *str, size_t len);
  static bool is_prefixed_hash(const char *str);
  static bool unescape(const char **in, char **out, const char *seq, char value);

  /*
   * Shape of the hash suffix that ends every mangled Rust symbol: the
   * separator-plus-marker "::h" followed by the hex digits of the hash.
   */
  static const char *hash_prefix = "::h";
  static const size_t hash_prefix_len = 3;	/* length of hash_prefix */
  static const size_t hash_len = 16;		/* hex digits in the hash */
  49. /*
  50. * INPUT:
  51. * sym: symbol that has been through BFD-demangling
  52. *
  53. * This function looks for the following indicators:
  54. *
  55. * 1. The hash must consist of "h" followed by 16 lowercase hex digits.
  56. *
  57. * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
  58. * hex digits. This is true of 99.9998% of hashes so once in your life you
  59. * may see a false negative. The point is to notice path components that
  60. * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
  61. * this case a false positive (non-Rust symbol has an important path
  62. * component removed because it looks like a Rust hash) is worse than a
  63. * false negative (the rare Rust symbol is not demangled) so this sets the
  64. * balance in favor of false negatives.
  65. *
  66. * 3. There must be no characters other than a-zA-Z0-9 and _.:$
  67. *
  68. * 4. There must be no unrecognized $-sign sequences.
  69. *
  70. * 5. There must be no sequence of three or more dots in a row ("...").
  71. */
  72. bool
  73. rust_is_mangled(const char *sym)
  74. {
  75. size_t len, len_without_hash;
  76. if (!sym)
  77. return false;
  78. len = strlen(sym);
  79. if (len <= hash_prefix_len + hash_len)
  80. /* Not long enough to contain "::h" + hash + something else */
  81. return false;
  82. len_without_hash = len - (hash_prefix_len + hash_len);
  83. if (!is_prefixed_hash(sym + len_without_hash))
  84. return false;
  85. return looks_like_rust(sym, len_without_hash);
  86. }
  87. /*
  88. * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
  89. * digits must comprise between 5 and 15 (inclusive) distinct digits.
  90. */
  91. static bool is_prefixed_hash(const char *str)
  92. {
  93. const char *end;
  94. bool seen[16];
  95. size_t i;
  96. int count;
  97. if (strncmp(str, hash_prefix, hash_prefix_len))
  98. return false;
  99. str += hash_prefix_len;
  100. memset(seen, false, sizeof(seen));
  101. for (end = str + hash_len; str < end; str++)
  102. if (*str >= '0' && *str <= '9')
  103. seen[*str - '0'] = true;
  104. else if (*str >= 'a' && *str <= 'f')
  105. seen[*str - 'a' + 10] = true;
  106. else
  107. return false;
  108. /* Count how many distinct digits seen */
  109. count = 0;
  110. for (i = 0; i < 16; i++)
  111. if (seen[i])
  112. count++;
  113. return count >= 5 && count <= 15;
  114. }
  /*
   * Validate the first len bytes of str (the symbol without its hash suffix).
   *
   * Returns true when that span consists solely of a-zA-Z0-9, '_', ':', '.'
   * and recognized "$...$" escape sequences, and holds no run of three or
   * more dots. Returns false at the first violation.
   */
  static bool looks_like_rust(const char *str, size_t len)
  {
      /* Every escape sequence the mangler is known to emit. */
      static const char *const escapes[] = {
          "$C$",
          "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
          "$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
      };
      const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
      const char *cur = str;
      const char *limit = str + len;

      while (cur < limit) {
          const char c = *cur;

          if (c == '$') {
              size_t matched = 0;
              size_t i;

              for (i = 0; i < nr_escapes; i++) {
                  const size_t esc_len = strlen(escapes[i]);

                  if (!strncmp(cur, escapes[i], esc_len)) {
                      matched = esc_len;
                      break;
                  }
              }
              /* A '$' that opens no known sequence disqualifies. */
              if (!matched)
                  return false;
              cur += matched;
              continue;
          }

          if (c == '.') {
              /* Do not allow three or more consecutive dots */
              if (!strncmp(cur, "...", 3))
                  return false;
              cur++;
              continue;
          }

          if ((c >= 'a' && c <= 'z') ||
              (c >= 'A' && c <= 'Z') ||
              (c >= '0' && c <= '9') ||
              c == '_' || c == ':') {
              cur++;
              continue;
          }

          /* Anything else cannot appear in a mangled Rust symbol. */
          return false;
      }

      return true;
  }
  157. /*
  158. * INPUT:
  159. * sym: symbol for which rust_is_mangled(sym) returns true
  160. *
  161. * The input is demangled in-place because the mangled name is always longer
  162. * than the demangled one.
  163. */
  164. void
  165. rust_demangle_sym(char *sym)
  166. {
  167. const char *in;
  168. char *out;
  169. const char *end;
  170. if (!sym)
  171. return;
  172. in = sym;
  173. out = sym;
  174. end = sym + strlen(sym) - (hash_prefix_len + hash_len);
  175. while (in < end)
  176. switch (*in) {
  177. case '$':
  178. if (!(unescape(&in, &out, "$C$", ',')
  179. || unescape(&in, &out, "$SP$", '@')
  180. || unescape(&in, &out, "$BP$", '*')
  181. || unescape(&in, &out, "$RF$", '&')
  182. || unescape(&in, &out, "$LT$", '<')
  183. || unescape(&in, &out, "$GT$", '>')
  184. || unescape(&in, &out, "$LP$", '(')
  185. || unescape(&in, &out, "$RP$", ')')
  186. || unescape(&in, &out, "$u20$", ' ')
  187. || unescape(&in, &out, "$u27$", '\'')
  188. || unescape(&in, &out, "$u5b$", '[')
  189. || unescape(&in, &out, "$u5d$", ']')
  190. || unescape(&in, &out, "$u7e$", '~'))) {
  191. pr_err("demangle-rust: unexpected escape sequence");
  192. goto done;
  193. }
  194. break;
  195. case '_':
  196. /*
  197. * If this is the start of a path component and the next
  198. * character is an escape sequence, ignore the
  199. * underscore. The mangler inserts an underscore to make
  200. * sure the path component begins with a XID_Start
  201. * character.
  202. */
  203. if ((in == sym || in[-1] == ':') && in[1] == '$')
  204. in++;
  205. else
  206. *out++ = *in++;
  207. break;
  208. case '.':
  209. if (in[1] == '.') {
  210. /* ".." becomes "::" */
  211. *out++ = ':';
  212. *out++ = ':';
  213. in += 2;
  214. } else {
  215. /* "." becomes "-" */
  216. *out++ = '-';
  217. in++;
  218. }
  219. break;
  220. case 'a' ... 'z':
  221. case 'A' ... 'Z':
  222. case '0' ... '9':
  223. case ':':
  224. *out++ = *in++;
  225. break;
  226. default:
  227. pr_err("demangle-rust: unexpected character '%c' in symbol\n",
  228. *in);
  229. goto done;
  230. }
  231. done:
  232. *out = '\0';
  233. }
  /*
   * Try to consume the escape sequence seq at the read cursor *in.
   *
   * On a match, value is stored at the write cursor *out, *in is advanced
   * past the whole sequence, *out is advanced by one character and true is
   * returned. Without a match both cursors are left unchanged and false is
   * returned.
   */
  static bool unescape(const char **in, char **out, const char *seq, char value)
  {
      const size_t seq_len = strlen(seq);
      const bool matched = strncmp(*in, seq, seq_len) == 0;

      if (matched) {
          *(*out)++ = value;
          *in += seq_len;
      }

      return matched;
  }