filenameMetadata.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682
  1. #include "filenameMetadata.h"
  2. #include "metadata.h"
  3. #include "stl/stringUtils.h"
  4. #include "macros.h"
  5. #include <functional>
  6. #include <algorithm>
  7. #include <string>
  8. #include <list>
  9. #include <assert.h>
  10. using namespace std;
  11. using namespace uniString;
  12. using namespace stringUtil;
  13. /*
  14. Overview of how it works:
  15. The setPattern() method looks at the pattern string and builds a stack of parseState objects.
  16. Each of these objects is responsible for finding its associated pattern within a text range.
  17. Parsing is done right to left.
  18. The parseState_optional object is used to encapsulate other state objects that are
  19. optional (bracketed by [] in the pattern).
  20. */
  21. class filenameMetadata::impl
  22. {
  23. // to make unicode compatibility easier, we're just going to store things as utf32
  24. utf32 m_pattern;
  25. utf32 m_data;
  26. // but the map will be in utf8
  27. typedef map<utf8,utf8> tokenMap_t;
  28. tokenMap_t m_tokens;
  29. class parseState;
  30. typedef list<parseState*> parseStack_t;
  31. parseStack_t m_parseStack;
  32. static void clearparseStack(parseStack_t &ps) throw()
  33. {
  34. while (!ps.empty())
  35. {
  36. delete ps.back();
  37. ps.pop_back();
  38. }
  39. }
  40. void clearparseStack() throw() { clearparseStack(m_parseStack); }
  41. ///////////// parse states //////////////////////////
  42. class parseState // virtual base
  43. {
  44. public:
  45. typedef utf32::const_reverse_iterator range_e;
  46. typedef pair<range_e, range_e> range_t;
  47. virtual ~parseState() throw() {}
  48. virtual range_t findRange(range_e rbegin, range_e rend) throw() { return make_pair(rbegin, rend); }
  49. virtual void setFromRange(range_e /*rbegin*/, range_e /*rend*/) throw() {}
  50. virtual void reportValue(tokenMap_t &/*tm*/) const throw() {}
  51. virtual utf8 describe() const throw() = 0; // for diagnostics
  52. virtual void reset() throw() {}
  53. virtual bool optional() const throw() { return false; }
  54. virtual bool finite() const throw() { return false; } // fixed width match
  55. };
  56. class parseState_optional: public parseState
  57. {
  58. parseStack_t m_parseStack;
  59. public:
  60. parseState_optional(){}
  61. ~parseState_optional() throw() { clearparseStack(m_parseStack); }
  62. parseStack_t* stack() throw() { return &m_parseStack; }
  63. virtual bool optional() const throw() { return true; }
  64. virtual void reset() throw()
  65. {
  66. for_each(m_parseStack.begin(), m_parseStack.end(), mem_fun(&parseState::reset));
  67. }
  68. virtual utf8 describe() const throw()
  69. {
  70. utf8 result("[");
  71. for (parseStack_t::const_iterator i = m_parseStack.begin(); i != m_parseStack.end(); ++i)
  72. {
  73. result += (*i)->describe();
  74. }
  75. result = result + utf8("]");
  76. return result;
  77. }
  78. virtual range_t findRange(range_e rbegin,range_e rend) throw()
  79. {
  80. const range_t NOTFOUND(make_pair(rend, rend));
  81. range_t result;
  82. reset();
  83. if (m_parseStack.empty())
  84. {
  85. return NOTFOUND;
  86. }
  87. parseStack_t::reverse_iterator s_cur = m_parseStack.rbegin();
  88. parseStack_t::reverse_iterator s_nxt = s_cur;
  89. ++s_nxt;
  90. range_e data_start = rbegin;
  91. range_e data_end = rend;
  92. range_e last_restart = rbegin;
  93. bool first(true);
  94. while (s_cur != m_parseStack.rend())
  95. {
  96. if (data_start == data_end)
  97. {
  98. reset();
  99. return NOTFOUND;
  100. }
  101. range_t curR(NOTFOUND);
  102. range_t nxtR(NOTFOUND);
  103. curR = (*s_cur)->findRange(data_start,data_end);
  104. if (curR.first == data_end)
  105. {
  106. reset();
  107. return NOTFOUND;
  108. }
  109. if ((!first) && (curR.first != data_start))
  110. {
  111. // must abut. Try moving forward again
  112. reset();
  113. s_cur = m_parseStack.rbegin();
  114. s_nxt = s_cur;
  115. ++s_nxt;
  116. ++last_restart;
  117. data_start = last_restart;
  118. first = true;
  119. continue;
  120. }
  121. if (first)
  122. {
  123. result.first = curR.first;
  124. }
  125. first = false;
  126. // don't do this if we have a single character state followed
  127. // by anything (in particular, a string which eats all
  128. if (curR.first + 1 != curR.second)
  129. {
  130. if (s_nxt != m_parseStack.rend())
  131. {
  132. nxtR = (*s_nxt)->findRange(data_start,data_end);
  133. }
  134. if (nxtR.first < curR.second)
  135. {
  136. curR.second = nxtR.first;
  137. }
  138. }
  139. (*s_cur)->setFromRange(curR.first,curR.second);
  140. s_cur = s_nxt;
  141. if (s_nxt != m_parseStack.rend())
  142. {
  143. ++s_nxt;
  144. }
  145. data_start = curR.second;
  146. }
  147. result.second = data_start;
  148. return result;
  149. }
  150. virtual void setFromRange(utf32::const_reverse_iterator rbegin, utf32::const_reverse_iterator rend) throw()
  151. {
  152. findRange(rbegin, rend); // resets to restricted range if necessary
  153. }
  154. virtual void reportValue(tokenMap_t &tm) const throw()
  155. {
  156. for (parseStack_t::const_iterator i = m_parseStack.begin(); i != m_parseStack.end(); ++i)
  157. {
  158. (*i)->reportValue(tm);
  159. }
  160. }
  161. virtual bool finite() const throw()
  162. {
  163. bool result = true;
  164. for (parseStack_t::const_iterator i = m_parseStack.begin(); i != m_parseStack.end(); ++i)
  165. {
  166. result &= (*i)->finite();
  167. }
  168. return result;
  169. }
  170. };
  171. class parseState_char: public parseState
  172. {
  173. utf32::value_type m_char;
  174. public:
  175. explicit parseState_char(utf32::value_type c) : m_char(c){}
  176. virtual pair<utf32::const_reverse_iterator,utf32::const_reverse_iterator>
  177. findRange(utf32::const_reverse_iterator rbegin,utf32::const_reverse_iterator rend) throw()
  178. {
  179. for (utf32::const_reverse_iterator i = rbegin; i != rend; ++i)
  180. {
  181. if ((*i) == m_char)
  182. {
  183. return make_pair(i, i + 1);
  184. }
  185. }
  186. return make_pair(rend,rend);
  187. }
  188. virtual utf8 describe() const throw()
  189. {
  190. utf32 u32; u32.push_back(m_char);
  191. return u32.toUtf8();
  192. }
  193. virtual bool finite() const throw() { return true; }
  194. };
  195. class parseState_stringSymbol: public parseState
  196. {
  197. utf8 m_symbolName; // can be empty for any string
  198. utf32 m_value;
  199. public:
  200. parseState_stringSymbol() throw(){}
  201. explicit parseState_stringSymbol(const string &s) throw() : m_symbolName(s){}
  202. ~parseState_stringSymbol() throw(){}
  203. void reset() throw() { m_value.clear(); }
  204. void setFromRange(utf32::const_reverse_iterator rbegin,utf32::const_reverse_iterator rend) throw()
  205. {
  206. if (!m_symbolName.empty())
  207. {
  208. m_value.clear();
  209. m_value.insert(m_value.begin(),rbegin,rend);
  210. reverse(m_value.begin(),m_value.end());
  211. m_value = stripWhitespace(m_value);
  212. }
  213. }
  214. virtual void reportValue(tokenMap_t &tm) const throw()
  215. {
  216. if (!m_symbolName.empty() && !m_value.empty())
  217. {
  218. tm[m_symbolName] = m_value.toUtf8();
  219. }
  220. }
  221. virtual utf8 describe() const throw()
  222. {
  223. if (m_symbolName.empty()) return utf8("*");
  224. return utf8("%") + m_symbolName;
  225. }
  226. };
  227. class parseState_digits: public parseState
  228. {
  229. public:
  230. parseState_digits() throw(){}
  231. virtual range_t findRange(range_e rbegin,range_e rend) throw()
  232. {
  233. range_t result(make_pair(rend,rend));
  234. bool got_start = false;
  235. for (utf32::const_reverse_iterator i = rbegin; i != rend; ++i)
  236. {
  237. if (uniString::is_a_number(*i))
  238. {
  239. if (!got_start)
  240. {
  241. got_start = true;
  242. result.first = i;
  243. }
  244. }
  245. else
  246. {
  247. if (got_start)
  248. {
  249. result.second = i;
  250. return result;
  251. }
  252. }
  253. }
  254. return result;
  255. }
  256. virtual utf8 describe() const throw() { return utf8("%#"); }
  257. };
  258. class parseState_year: public parseState
  259. {
  260. utf32 m_value;
  261. public:
  262. parseState_year() throw(){}
  263. ~parseState_year() throw(){}
  264. void reset() throw() { m_value.clear(); }
  265. virtual pair<utf32::const_reverse_iterator,utf32::const_reverse_iterator>
  266. findRange(utf32::const_reverse_iterator rbegin,utf32::const_reverse_iterator rend) throw()
  267. {
  268. int count = 4;
  269. pair<utf32::const_reverse_iterator,utf32::const_reverse_iterator> result(make_pair(rend,rend));
  270. bool got_start = false;
  271. for (utf32::const_reverse_iterator i = rbegin; i != rend; ++i)
  272. {
  273. if (uniString::is_a_number(*i))
  274. {
  275. if (!got_start)
  276. {
  277. got_start = true;
  278. result.first = i;
  279. }
  280. count -= 1;
  281. if (count == 0)
  282. {
  283. result.second = ++i;
  284. return result;
  285. }
  286. }
  287. else
  288. {
  289. if (got_start)
  290. {
  291. got_start = false;
  292. result.first = rend;
  293. }
  294. }
  295. }
  296. return make_pair(rend,rend);
  297. }
  298. void setFromRange(utf32::const_reverse_iterator rbegin,utf32::const_reverse_iterator rend) throw()
  299. {
  300. m_value.clear();
  301. m_value.insert(m_value.begin(),rbegin,rend);
  302. reverse(m_value.begin(),m_value.end());
  303. }
  304. virtual void reportValue(tokenMap_t &tm) const throw()
  305. {
  306. if (!m_value.empty())
  307. {
  308. tm[utf8(metadata::YEAR())] = m_value.toUtf8();
  309. }
  310. }
  311. virtual utf8 describe() const throw() { return utf8("%YEAR"); }
  312. virtual bool finite() const throw() { return true; }
  313. };
  314. class parseState_fixed: public parseState
  315. {
  316. utf32 m_value;
  317. public:
  318. // fixed string
  319. explicit parseState_fixed(const utf32 &val) throw() : m_value(val) {}
  320. ~parseState_fixed() throw(){}
  321. virtual pair<utf32::const_reverse_iterator,utf32::const_reverse_iterator>
  322. findRange(utf32::const_reverse_iterator rbegin,utf32::const_reverse_iterator rend) throw()
  323. {
  324. assert(!m_value.empty());
  325. if (m_value.empty()) return make_pair(rend,rend);
  326. for (utf32::const_reverse_iterator i = rbegin; i != rend; ++i)
  327. {
  328. if ((*i) == (*(m_value.rbegin())))
  329. {
  330. utf32::const_reverse_iterator t_i = i;
  331. utf32::const_reverse_iterator v_i = m_value.rbegin();
  332. utf32::const_reverse_iterator v_i_end = m_value.rend();
  333. bool match(true);
  334. for (; match && (v_i != v_i_end); ++t_i, ++v_i)
  335. {
  336. if ((t_i == rend) || ((*t_i) != (*v_i)))
  337. {
  338. match = false;
  339. }
  340. }
  341. if (match)
  342. {
  343. return make_pair(i, i + m_value.size());
  344. }
  345. }
  346. }
  347. return make_pair(rend,rend);
  348. }
  349. virtual utf8 describe() const throw()
  350. {
  351. return m_value.toUtf8();
  352. }
  353. virtual bool finite() const throw() { return true; }
  354. };
  355. static string stringify(utf32::value_type v) throw()
  356. {
  357. if (v >= '0' && v <= 'z') return string(1,(string::value_type)v);
  358. return tos((int)v);
  359. }
  360. public:
  361. impl(){}
  362. ~impl() throw()
  363. {
  364. clearparseStack();
  365. }
  366. void deleteToken(const utf8 &token) throw()
  367. {
  368. tokenMap_t::iterator i = m_tokens.find(token);
  369. if (i != m_tokens.end()) m_tokens.erase(i);
  370. }
  371. const tokenMap_t::size_type countTokens() const throw() { return m_tokens.size(); }
  372. utf8& operator[](const utf8 &key) throw() { return m_tokens[key]; }
  373. const map<utf8,utf8>& getTokens() const throw() { return m_tokens; }
  374. void setPattern(const utf8 &pattern) throw(runtime_error)
  375. {
  376. parseState_optional *opt = 0;
  377. parseStack_t *stack = &m_parseStack;
  378. try
  379. {
  380. utf32 fixedAccumulator; // fixed string value
  381. #define DUMPACCUMULATOR { if (!fixedAccumulator.empty()) { stack->push_back(new parseState_fixed(fixedAccumulator)); fixedAccumulator.clear(); } }
  382. clearparseStack();
  383. m_pattern.assign(pattern);
  384. for (utf32::const_iterator i = m_pattern.begin(); i != m_pattern.end(); ++i)
  385. {
  386. if ((*i) == ']')
  387. {
  388. DUMPACCUMULATOR
  389. if (!opt) throw runtime_error("Unmatched ']' in pattern");
  390. stack = &m_parseStack;
  391. stack->push_back(opt);
  392. opt = 0;
  393. }
  394. else if ((*i) == '[')
  395. {
  396. DUMPACCUMULATOR
  397. if (opt) throw runtime_error("Optional sequences cannot be nested in pattern");
  398. opt = new parseState_optional;
  399. stack = opt->stack();
  400. }
  401. else if ((*i) == '%')
  402. {
  403. ++i;
  404. if (i == m_pattern.end()) throw runtime_error("Bad pattern. Trailing %");
  405. switch (*i)
  406. {
  407. case 'N': DUMPACCUMULATOR stack->push_back(new parseState_stringSymbol(metadata::NAME())); break;
  408. case 'G': DUMPACCUMULATOR stack->push_back(new parseState_stringSymbol(metadata::GENRE())); break;
  409. case 'A': DUMPACCUMULATOR stack->push_back(new parseState_stringSymbol(metadata::ALBUM())); break;
  410. case 'R': DUMPACCUMULATOR stack->push_back(new parseState_stringSymbol(metadata::ARTIST()));break;
  411. case 'Y': DUMPACCUMULATOR stack->push_back(new parseState_year); break;
  412. case '#': DUMPACCUMULATOR stack->push_back(new parseState_digits); break;
  413. case '%': fixedAccumulator.push_back('%'); break;
  414. default: throw runtime_error("Unknown symbol %" + stringify(*i));
  415. }
  416. }
  417. else if ((*i) == '*')
  418. {
  419. DUMPACCUMULATOR
  420. stack->push_back(new parseState_stringSymbol);
  421. }
  422. else
  423. {
  424. fixedAccumulator.push_back(*i);
  425. }
  426. }
  427. if (opt)
  428. {
  429. throw runtime_error("Unterminated optional sequence in pattern");
  430. }
  431. DUMPACCUMULATOR
  432. }
  433. catch(...)
  434. {
  435. delete opt;
  436. throw;
  437. }
  438. }
  439. static utf8 describeStackRange(parseStack_t::const_reverse_iterator begin,parseStack_t::const_reverse_iterator end) throw()
  440. {
  441. parseStack_t stck(begin,end);
  442. reverse(stck.begin(),stck.end());
  443. utf8 result;
  444. for (parseStack_t::const_iterator i = stck.begin(); i != stck.end(); ++i)
  445. {
  446. result = result + (*i)->describe();
  447. }
  448. return result;
  449. }
  450. static utf8 describeRemainingData(utf32::const_reverse_iterator begin,utf32::const_reverse_iterator end) throw()
  451. {
  452. utf32 u32(begin,end);
  453. reverse(u32.begin(),u32.end());
  454. return u32.toUtf8();
  455. }
  456. void parse(const utf8 &data) throw(runtime_error)
  457. {
  458. m_data.assign(data);
  459. // beginning and end of data string
  460. utf32::const_reverse_iterator data_start = m_data.rbegin();
  461. utf32::const_reverse_iterator data_end = m_data.rend();
  462. // current and next object pointers from the parse stack
  463. parseStack_t::reverse_iterator s_cur = m_parseStack.rbegin();
  464. parseStack_t::reverse_iterator s_nxt = s_cur;
  465. ++s_nxt;
  466. while(s_cur != m_parseStack.rend())
  467. {
  468. // if we haven't finished the parse stack, and we're out of data then it's an error
  469. if (data_start == data_end)
  470. {
  471. throw runtime_error("Premature end of data (" + describeStackRange(s_cur,m_parseStack.rend()).hideAsString() + ")");
  472. }
  473. // we do one lookahead. Get the range match for the current parse object and
  474. // the next parse object. Note that there is some added complexity due to optional objects
  475. pair<utf32::const_reverse_iterator,utf32::const_reverse_iterator> curR(make_pair(data_end,data_end));
  476. pair<utf32::const_reverse_iterator,utf32::const_reverse_iterator> nxtR(make_pair(data_end,data_end));
  477. // find widest possible match for current state
  478. curR = (*s_cur)->findRange(data_start,data_end);
  479. // if no match, and the object is optional, just move on to the next (continue)
  480. if ((curR.first == data_end) && (*s_cur)->optional())
  481. {
  482. s_cur = s_nxt;
  483. if (s_nxt != m_parseStack.rend())
  484. {
  485. ++s_nxt;
  486. }
  487. continue;
  488. }
  489. // if no match, but object is not optional, then we have an error
  490. if (curR.first == data_end)
  491. {
  492. throw runtime_error("Parse error, symbol not found (" + describeStackRange(s_cur,m_parseStack.rend()).hideAsString() + ") (" + describeRemainingData(data_start,data_end).hideAsString() + ")");
  493. }
  494. // if match was not found at our current starting point, then we have an error
  495. if (curR.first != data_start)
  496. {
  497. throw runtime_error("Parse error, data skipped to find symbol (" + describeStackRange(s_cur,m_parseStack.rend()).hideAsString() + ") (" + describeRemainingData(data_start,data_end).hideAsString() + ")");
  498. }
  499. // restrict match range by one lookahead. Do not do lookahead
  500. // if our current state is a single character match
  501. if (!(*s_cur)->finite()) //curR.first + 1 != curR.second)
  502. {
  503. // we must loop in case the followup objects are optional and we must
  504. // continue to look ahead
  505. while (true)
  506. {
  507. if (s_nxt == m_parseStack.rend()) break;
  508. // to handle the case of two optional string elements in a row, we
  509. // repeat this if the range of the current and follow up objects match by
  510. // incrementing the start
  511. nxtR = (*s_nxt)->findRange(data_start,data_end);
  512. if ((nxtR.first == curR.first) && (!(*s_nxt)->finite()))
  513. {
  514. nxtR = (*s_nxt)->findRange(data_start+1,data_end);
  515. }
  516. if (nxtR.first < curR.second)
  517. {
  518. // lookahead object restricts range
  519. curR.second = nxtR.first;
  520. break;
  521. }
  522. if ((nxtR.first == data_end) && (nxtR.second == data_end) && (*s_nxt)->optional())
  523. {
  524. // lookahead object not found and is optional. try the next
  525. ++s_nxt;
  526. }
  527. else
  528. {
  529. // no restriction
  530. break;
  531. }
  532. }
  533. }
  534. // set value and advance to next parse object
  535. (*s_cur)->setFromRange(curR.first,curR.second);
  536. s_cur = s_nxt;
  537. if (s_nxt != m_parseStack.rend())
  538. {
  539. ++s_nxt;
  540. }
  541. data_start = curR.second;
  542. }
  543. if (data_start != data_end)
  544. {
  545. throw runtime_error("Data extends beyond pattern (" + describeRemainingData(data_start,data_end).hideAsString() + ")");
  546. }
  547. m_tokens.clear();
  548. for (parseStack_t::const_iterator i = m_parseStack.begin(); i != m_parseStack.end(); ++i)
  549. {
  550. (*i)->reportValue(m_tokens);
  551. }
  552. }
  553. };
  554. //////////////////////////////////////////////////////////////////////////////
  555. //////////////////////////////////////////////////////////////////////////////
  556. filenameMetadata::filenameMetadata(): m_impl(0)
  557. {
  558. m_impl = new filenameMetadata::impl;
  559. }
  560. filenameMetadata::~filenameMetadata() throw()
  561. {
  562. forget(m_impl);
  563. }
  564. void filenameMetadata::setPattern(const utf8 &pattern) throw(exception)
  565. {
  566. assert(m_impl);
  567. if (!m_impl) throw logic_error(string(__FUNCTION__) + " internal impl object is null");
  568. m_impl->setPattern(pattern);
  569. }
  570. void filenameMetadata::parse(const utf8 &data) throw(exception)
  571. {
  572. assert(m_impl);
  573. if (!m_impl) throw logic_error(string(__FUNCTION__) + " internal impl object is null");
  574. m_impl->parse(data);
  575. }
  576. void filenameMetadata::deleteToken(const utf8 &token) throw(exception)
  577. {
  578. assert(m_impl);
  579. if (!m_impl) throw logic_error(string(__FUNCTION__) + " internal impl object is null");
  580. m_impl->deleteToken(token);
  581. }
  582. const size_t filenameMetadata::countTokens() throw(exception)
  583. {
  584. assert(m_impl);
  585. if (!m_impl) throw logic_error(string(__FUNCTION__) + " internal impl object is null");
  586. return m_impl->countTokens();
  587. }
  588. utf8& filenameMetadata::operator[](const utf8 &key) throw(exception)
  589. {
  590. assert(m_impl);
  591. if (!m_impl) throw logic_error(string(__FUNCTION__) + " internal impl object is null");
  592. return m_impl->operator[](key);
  593. }
  594. const map<utf8,utf8>& filenameMetadata::getTokens() const throw(exception)
  595. {
  596. assert(m_impl);
  597. if (!m_impl) throw logic_error(string(__FUNCTION__) + " internal impl object is null");
  598. return m_impl->getTokens();
  599. }