demangle-rust.c 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. #include <string.h>
  2. #include "util.h"
  3. #include "debug.h"
  4. #include "demangle-rust.h"
  5. /*
  6. * Mangled Rust symbols look like this:
  7. *
  8. * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
  9. *
  10. * The original symbol is:
  11. *
  12. * <std::sys::fd::FileDesc as core::ops::Drop>::drop
  13. *
  14. * The last component of the path is a 64-bit hash in lowercase hex, prefixed
  15. * with "h". Rust does not have a global namespace between crates, an illusion
  16. * which Rust maintains by using the hash to distinguish things that would
  17. * otherwise have the same symbol.
  18. *
 * Any path component not starting with an XID_Start character is prefixed with
 * "_".
  21. *
  22. * The following escape sequences are used:
  23. *
  24. * "," => $C$
  25. * "@" => $SP$
  26. * "*" => $BP$
  27. * "&" => $RF$
  28. * "<" => $LT$
  29. * ">" => $GT$
  30. * "(" => $LP$
  31. * ")" => $RP$
  32. * " " => $u20$
  33. * "'" => $u27$
  34. * "[" => $u5b$
  35. * "]" => $u5d$
  36. * "~" => $u7e$
  37. *
  38. * A double ".." means "::" and a single "." means "-".
  39. *
  40. * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
  41. */
  42. static const char *hash_prefix = "::h";
  43. static const size_t hash_prefix_len = 3;
  44. static const size_t hash_len = 16;
  45. static bool is_prefixed_hash(const char *start);
  46. static bool looks_like_rust(const char *sym, size_t len);
  47. static bool unescape(const char **in, char **out, const char *seq, char value);
  48. /*
  49. * INPUT:
  50. * sym: symbol that has been through BFD-demangling
  51. *
  52. * This function looks for the following indicators:
  53. *
  54. * 1. The hash must consist of "h" followed by 16 lowercase hex digits.
  55. *
  56. * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
  57. * hex digits. This is true of 99.9998% of hashes so once in your life you
  58. * may see a false negative. The point is to notice path components that
  59. * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
  60. * this case a false positive (non-Rust symbol has an important path
  61. * component removed because it looks like a Rust hash) is worse than a
  62. * false negative (the rare Rust symbol is not demangled) so this sets the
  63. * balance in favor of false negatives.
  64. *
  65. * 3. There must be no characters other than a-zA-Z0-9 and _.:$
  66. *
  67. * 4. There must be no unrecognized $-sign sequences.
  68. *
  69. * 5. There must be no sequence of three or more dots in a row ("...").
  70. */
  71. bool
  72. rust_is_mangled(const char *sym)
  73. {
  74. size_t len, len_without_hash;
  75. if (!sym)
  76. return false;
  77. len = strlen(sym);
  78. if (len <= hash_prefix_len + hash_len)
  79. /* Not long enough to contain "::h" + hash + something else */
  80. return false;
  81. len_without_hash = len - (hash_prefix_len + hash_len);
  82. if (!is_prefixed_hash(sym + len_without_hash))
  83. return false;
  84. return looks_like_rust(sym, len_without_hash);
  85. }
  86. /*
  87. * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
  88. * digits must comprise between 5 and 15 (inclusive) distinct digits.
  89. */
  90. static bool is_prefixed_hash(const char *str)
  91. {
  92. const char *end;
  93. bool seen[16];
  94. size_t i;
  95. int count;
  96. if (strncmp(str, hash_prefix, hash_prefix_len))
  97. return false;
  98. str += hash_prefix_len;
  99. memset(seen, false, sizeof(seen));
  100. for (end = str + hash_len; str < end; str++)
  101. if (*str >= '0' && *str <= '9')
  102. seen[*str - '0'] = true;
  103. else if (*str >= 'a' && *str <= 'f')
  104. seen[*str - 'a' + 10] = true;
  105. else
  106. return false;
  107. /* Count how many distinct digits seen */
  108. count = 0;
  109. for (i = 0; i < 16; i++)
  110. if (seen[i])
  111. count++;
  112. return count >= 5 && count <= 15;
  113. }
  114. static bool looks_like_rust(const char *str, size_t len)
  115. {
  116. const char *end = str + len;
  117. while (str < end)
  118. switch (*str) {
  119. case '$':
  120. if (!strncmp(str, "$C$", 3))
  121. str += 3;
  122. else if (!strncmp(str, "$SP$", 4)
  123. || !strncmp(str, "$BP$", 4)
  124. || !strncmp(str, "$RF$", 4)
  125. || !strncmp(str, "$LT$", 4)
  126. || !strncmp(str, "$GT$", 4)
  127. || !strncmp(str, "$LP$", 4)
  128. || !strncmp(str, "$RP$", 4))
  129. str += 4;
  130. else if (!strncmp(str, "$u20$", 5)
  131. || !strncmp(str, "$u27$", 5)
  132. || !strncmp(str, "$u5b$", 5)
  133. || !strncmp(str, "$u5d$", 5)
  134. || !strncmp(str, "$u7e$", 5))
  135. str += 5;
  136. else
  137. return false;
  138. break;
  139. case '.':
  140. /* Do not allow three or more consecutive dots */
  141. if (!strncmp(str, "...", 3))
  142. return false;
  143. /* Fall through */
  144. case 'a' ... 'z':
  145. case 'A' ... 'Z':
  146. case '0' ... '9':
  147. case '_':
  148. case ':':
  149. str++;
  150. break;
  151. default:
  152. return false;
  153. }
  154. return true;
  155. }
  156. /*
  157. * INPUT:
  158. * sym: symbol for which rust_is_mangled(sym) returns true
  159. *
  160. * The input is demangled in-place because the mangled name is always longer
  161. * than the demangled one.
  162. */
  163. void
  164. rust_demangle_sym(char *sym)
  165. {
  166. const char *in;
  167. char *out;
  168. const char *end;
  169. if (!sym)
  170. return;
  171. in = sym;
  172. out = sym;
  173. end = sym + strlen(sym) - (hash_prefix_len + hash_len);
  174. while (in < end)
  175. switch (*in) {
  176. case '$':
  177. if (!(unescape(&in, &out, "$C$", ',')
  178. || unescape(&in, &out, "$SP$", '@')
  179. || unescape(&in, &out, "$BP$", '*')
  180. || unescape(&in, &out, "$RF$", '&')
  181. || unescape(&in, &out, "$LT$", '<')
  182. || unescape(&in, &out, "$GT$", '>')
  183. || unescape(&in, &out, "$LP$", '(')
  184. || unescape(&in, &out, "$RP$", ')')
  185. || unescape(&in, &out, "$u20$", ' ')
  186. || unescape(&in, &out, "$u27$", '\'')
  187. || unescape(&in, &out, "$u5b$", '[')
  188. || unescape(&in, &out, "$u5d$", ']')
  189. || unescape(&in, &out, "$u7e$", '~'))) {
  190. pr_err("demangle-rust: unexpected escape sequence");
  191. goto done;
  192. }
  193. break;
  194. case '_':
  195. /*
  196. * If this is the start of a path component and the next
  197. * character is an escape sequence, ignore the
  198. * underscore. The mangler inserts an underscore to make
  199. * sure the path component begins with a XID_Start
  200. * character.
  201. */
  202. if ((in == sym || in[-1] == ':') && in[1] == '$')
  203. in++;
  204. else
  205. *out++ = *in++;
  206. break;
  207. case '.':
  208. if (in[1] == '.') {
  209. /* ".." becomes "::" */
  210. *out++ = ':';
  211. *out++ = ':';
  212. in += 2;
  213. } else {
  214. /* "." becomes "-" */
  215. *out++ = '-';
  216. in++;
  217. }
  218. break;
  219. case 'a' ... 'z':
  220. case 'A' ... 'Z':
  221. case '0' ... '9':
  222. case ':':
  223. *out++ = *in++;
  224. break;
  225. default:
  226. pr_err("demangle-rust: unexpected character '%c' in symbol\n",
  227. *in);
  228. goto done;
  229. }
  230. done:
  231. *out = '\0';
  232. }
  233. static bool unescape(const char **in, char **out, const char *seq, char value)
  234. {
  235. size_t len = strlen(seq);
  236. if (strncmp(*in, seq, len))
  237. return false;
  238. **out = value;
  239. *in += len;
  240. *out += 1;
  241. return true;
  242. }