xdelta3-regtest.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265
  1. #!/usr/bin/python2.7
  2. # xdelta3 - delta compression tools and library -*- Mode: C++ -*-
  3. # Copyright 2016 Joshua MacDonald
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. # TODO This code is no longer maintained :(
  17. import os, sys, math, re, time, types, array, random
  18. import xdelta3
  19. RCSDIR = '/tmp/rcs'
  20. SAMPLEDIR = "/tmp/diff"
  21. #
  22. MIN_SIZE = 0
  23. TIME_TOO_SHORT = 0.050
  24. SKIP_TRIALS = 2
  25. MIN_TRIALS = 3
  26. MAX_TRIALS = 15
  27. # 10 = fast 1.5 = slow
  28. MIN_STDDEV_PCT = 1.5
  29. # How many results per round
  30. MAX_RESULTS = 500
  31. TEST_ROUNDS = 10
  32. KEEP_P = (0.5)
  33. # For RCS testing, what percent to select
  34. FILE_P = (0.50)
  35. # For run-speed tests
  36. MIN_RUN = 1000 * 1000 * 1
  37. MAX_RUN = 1000 * 1000 * 10
  38. # Testwide defaults
  39. ALL_ARGS = [
  40. '-q' # '-vv'
  41. ]
  42. # The first 7 args go to -C
  43. SOFT_CONFIG_CNT = 7
  44. CONFIG_ORDER = [ 'large_look',
  45. 'large_step',
  46. 'small_look',
  47. 'small_chain',
  48. 'small_lchain',
  49. 'max_lazy',
  50. 'long_enough',
  51. # > SOFT_CONFIG_CNT
  52. 'nocompress',
  53. 'winsize',
  54. 'srcwinsize',
  55. 'sprevsz',
  56. 'iopt',
  57. 'djw',
  58. 'altcode',
  59. ]
  60. CONFIG_ARGMAP = {
  61. 'winsize' : '-W',
  62. 'srcwinsize' : '-B',
  63. 'sprevsz' : '-P',
  64. 'iopt' : '-I',
  65. 'nocompress' : '-N',
  66. 'djw' : '-Sdjw',
  67. 'altcode' : '-T',
  68. }
  69. def INPUT_SPEC(rand):
  70. return {
  71. # Time/space costs:
  72. # -C 1,2,3,4,5,6,7
  73. 'large_look' : lambda d: rand.choice([9, 10, 11, 12]),
  74. 'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]),
  75. 'small_look' : lambda d: rand.choice([4]),
  76. 'small_chain' : lambda d: rand.choice([1]),
  77. 'small_lchain' : lambda d: rand.choice([1]),
  78. 'max_lazy' : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]),
  79. # Note: long_enough only refers to small matching and has no effect if
  80. # small_chain == 1.
  81. 'long_enough' : lambda d: rand.choice([4]),
  82. # -N
  83. 'nocompress' : lambda d: rand.choice(['false']),
  84. # -T
  85. 'altcode' : lambda d: rand.choice(['false']),
  86. # -S djw
  87. 'djw' : lambda d: rand.choice(['false']),
  88. # Memory costs:
  89. # -W
  90. 'winsize' : lambda d: 8 * (1<<20),
  91. # -B
  92. 'srcwinsize' : lambda d: 64 * (1<<20),
  93. # -I 0 is unlimited
  94. 'iopt' : lambda d: 0,
  95. # -P only powers of two
  96. 'sprevsz' : lambda d: rand.choice([x * (1<<16) for x in [4]]),
  97. }
  98. #end
  99. #
  100. TMPDIR = '/tmp/xd3regtest.%d' % os.getpid()
  101. RUNFILE = os.path.join(TMPDIR, 'run')
  102. DFILE = os.path.join(TMPDIR, 'output')
  103. RFILE = os.path.join(TMPDIR, 'recon')
  104. CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1')
  105. CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2')
  106. HEAD_STATE = 0
  107. BAR_STATE = 1
  108. REV_STATE = 2
  109. DATE_STATE = 3
  110. #
  111. IGNORE_FILENAME = re.compile('.*\\.(gif|jpg).*')
  112. # rcs output
  113. RE_TOTREV = re.compile('total revisions: (\\d+)')
  114. RE_BAR = re.compile('----------------------------')
  115. RE_REV = re.compile('revision (.+)')
  116. RE_DATE = re.compile('date: ([^;]+);.*')
  117. # xdelta output
  118. RE_HDRSZ = re.compile('VCDIFF header size: +(\\d+)')
  119. RE_EXTCOMP = re.compile('XDELTA ext comp.*')
  120. def c2str(c):
  121. return ' '.join(['%s' % x for x in c])
  122. #end
  123. def SumList(l):
  124. return reduce(lambda x,y: x+y, l)
  125. #end
  126. # returns (total, mean, stddev, q2 (median),
  127. # (q3-q1)/2 ("semi-interquartile range"), max-min (spread))
  128. class StatList:
  129. def __init__(self,l,desc):
  130. cnt = len(l)
  131. assert(cnt > 1)
  132. l.sort()
  133. self.cnt = cnt
  134. self.l = l
  135. self.total = SumList(l)
  136. self.mean = self.total / float(self.cnt)
  137. self.s = math.sqrt(SumList([(x-self.mean) *
  138. (x - self.mean) for x in l]) /
  139. float(self.cnt-1))
  140. self.q0 = l[0]
  141. self.q1 = l[int(self.cnt/4.0+0.5)]
  142. self.q2 = l[int(self.cnt/2.0+0.5)]
  143. self.q3 = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))]
  144. self.q4 = l[self.cnt-1]
  145. self.siqr = (self.q3-self.q1)/2.0;
  146. self.spread = (self.q4-self.q0)
  147. if len(l) == 1:
  148. self.str = '%s %s' % (desc, l[0])
  149. else:
  150. self.str = '%s mean %.1f: 25%-ile %d %d %d %d %d' % \
  151. (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4)
  152. #end
  153. #end
  154. def RunCommand(args, ok = [0]):
  155. #print 'run command %s' % (' '.join(args))
  156. p = os.spawnvp(os.P_WAIT, args[0], args)
  157. if p not in ok:
  158. raise CommandError(args, 'exited %d' % p)
  159. #end
  160. #end
  161. def RunCommandIO(args,infn,outfn):
  162. p = os.fork()
  163. if p == 0:
  164. os.dup2(os.open(infn,os.O_RDONLY),0)
  165. os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1)
  166. os.execvp(args[0], args)
  167. else:
  168. s = os.waitpid(p,0)
  169. o = os.WEXITSTATUS(s[1])
  170. if not os.WIFEXITED(s[1]) or o != 0:
  171. raise CommandError(args, 'exited %d' % o)
  172. #end
  173. #end
  174. #end
  175. class TimedTest:
  176. def __init__(self, target, source, runnable,
  177. skip_trials = SKIP_TRIALS,
  178. min_trials = MIN_TRIALS,
  179. max_trials = MAX_TRIALS,
  180. min_stddev_pct = MIN_STDDEV_PCT):
  181. self.target = target
  182. self.source = source
  183. self.runnable = runnable
  184. self.skip_trials = skip_trials
  185. self.min_trials = min(min_trials, max_trials)
  186. self.max_trials = max_trials
  187. self.min_stddev_pct = min_stddev_pct
  188. self.encode_time = self.DoTest(DFILE,
  189. lambda x: x.Encode(self.target,
  190. self.source, DFILE))
  191. self.encode_size = runnable.EncodeSize(DFILE)
  192. self.decode_time = self.DoTest(RFILE,
  193. lambda x: x.Decode(DFILE,
  194. self.source, RFILE),
  195. )
  196. runnable.Verify(self.target, RFILE)
  197. #end
  198. def DoTest(self, fname, func):
  199. trials = 0
  200. measured = []
  201. while 1:
  202. try:
  203. os.remove(fname)
  204. except OSError:
  205. pass
  206. start_time = time.time()
  207. start_clock = time.clock()
  208. func(self.runnable)
  209. total_clock = (time.clock() - start_clock)
  210. total_time = (time.time() - start_time)
  211. elap_time = max(total_time, 0.0000001)
  212. elap_clock = max(total_clock, 0.0000001)
  213. trials = trials + 1
  214. # skip some of the first trials
  215. if trials > self.skip_trials:
  216. measured.append((elap_clock, elap_time))
  217. #print 'measurement total: %.1f ms' % (total_time * 1000.0)
  218. # at least so many
  219. if trials < (self.skip_trials + self.min_trials):
  220. #print 'continue: need more trials: %d' % trials
  221. continue
  222. # compute %variance
  223. done = 0
  224. if self.skip_trials + self.min_trials <= 2:
  225. measured = measured + measured;
  226. done = 1
  227. #end
  228. time_stat = StatList([x[1] for x in measured], 'elap time')
  229. sp = float(time_stat.s) / float(time_stat.mean)
  230. # what if MAX_TRIALS is exceeded?
  231. too_many = (trials - self.skip_trials) >= self.max_trials
  232. good = (100.0 * sp) < self.min_stddev_pct
  233. if done or too_many or good:
  234. trials = trials - self.skip_trials
  235. if not done and not good:
  236. #print 'too many trials: %d' % trials
  237. pass
  238. #clock = StatList([x[0] for x in measured], 'elap clock')
  239. return time_stat
  240. #end
  241. #end
  242. #end
  243. #end
  244. def Decimals(start, end):
  245. l = []
  246. step = start
  247. while 1:
  248. r = range(step, step * 10, step)
  249. l = l + r
  250. if step * 10 >= end:
  251. l.append(step * 10)
  252. break
  253. step = step * 10
  254. return l
  255. #end
  256. # This tests the raw speed of 0-byte inputs
  257. def RunSpeedTest():
  258. for L in Decimals(MIN_RUN, MAX_RUN):
  259. SetFileSize(RUNFILE, L)
  260. trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)]))
  261. ReportSpeed(L, trx, '1MB ')
  262. trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)]))
  263. ReportSpeed(L, trx, '512k')
  264. trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)]))
  265. ReportSpeed(L, trx, '256k')
  266. trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE))
  267. ReportSpeed(L, trm, 'swig')
  268. trg = TimedTest(RUNFILE, None, GzipRun1())
  269. ReportSpeed(L,trg,'gzip')
  270. #end
  271. #end
  272. def SetFileSize(F,L):
  273. fd = os.open(F, os.O_CREAT | os.O_WRONLY)
  274. os.ftruncate(fd,L)
  275. assert os.fstat(fd).st_size == L
  276. os.close(fd)
  277. #end
  278. def ReportSpeed(L,tr,desc):
  279. print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \
  280. (desc, L,
  281. tr.encode_size,
  282. tr.encode_time.mean * 1000.0,
  283. tr.decode_time.mean * 1000.0)
  284. #end
  285. class Xdelta3RunClass:
  286. def __init__(self, extra):
  287. self.extra = extra
  288. #end
  289. def __str__(self):
  290. return ' '.join(self.extra)
  291. #end
  292. def New(self):
  293. return Xdelta3Runner(self.extra)
  294. #end
  295. #end
  296. class Xdelta3Runner:
  297. # Use "forkexec" to get special command-line only features like
  298. # external compression support.
  299. def __init__(self, extra, forkexec=False):
  300. self.forkexec = forkexec
  301. self.extra = extra
  302. #end
  303. def Encode(self, target, source, output):
  304. args = (ALL_ARGS +
  305. self.extra +
  306. ['-e'])
  307. if source:
  308. args.append('-s')
  309. args.append(source)
  310. #end
  311. args = args + [target, output]
  312. self.Main(args)
  313. #end
  314. def Decode(self, input, source, output):
  315. args = (ALL_ARGS +
  316. ['-d'])
  317. if source:
  318. args.append('-s')
  319. args.append(source)
  320. #end
  321. args = args + [input, output]
  322. self.Main(args)
  323. #end
  324. def Verify(self, target, recon):
  325. if target[-3:] == ".gz":
  326. RunCommandIO(('gzip', '-dc'), target, CMPTMP1)
  327. RunCommandIO(('gzip', '-dc'), recon, CMPTMP2)
  328. RunCommand(('cmp', CMPTMP1, CMPTMP2))
  329. else:
  330. RunCommand(('cmp', target, recon))
  331. #end
  332. def EncodeSize(self, output):
  333. return os.stat(output).st_size
  334. #end
  335. def Main(self, args):
  336. try:
  337. if self.forkexec:
  338. RunCommand(['../xdelta3'] + args)
  339. else:
  340. xdelta3.xd3_main_cmdline(args)
  341. except Exception, e:
  342. raise CommandError(args, "xdelta3.main exception: %s" % e)
  343. #end
  344. #end
  345. #end
  346. class Xdelta3Mod1:
  347. def __init__(self, file):
  348. self.target_data = open(file, 'r').read()
  349. #end
  350. def Encode(self, ignore1, ignore2, ignore3):
  351. r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10)
  352. if r1 != 0:
  353. raise CommandError('memory', 'encode failed: %s' % r1)
  354. #end
  355. self.encoded = encoded
  356. #end
  357. def Decode(self, ignore1, ignore2, ignore3):
  358. r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data))
  359. if r2 != 0:
  360. raise CommandError('memory', 'decode failed: %s' % r1)
  361. #end
  362. self.decoded = data1
  363. #end
  364. def Verify(self, ignore1, ignore2):
  365. if self.target_data != self.decoded:
  366. raise CommandError('memory', 'bad decode')
  367. #end
  368. #end
  369. def EncodeSize(self, ignore1):
  370. return len(self.encoded)
  371. #end
  372. #end
  373. class GzipRun1:
  374. def Encode(self, target, source, output):
  375. assert source == None
  376. RunCommandIO(['gzip', '-cf'], target, output)
  377. #end
  378. def Decode(self, input, source, output):
  379. assert source == None
  380. RunCommandIO(['gzip', '-dcf'], input, output)
  381. #end
  382. def Verify(self, target, recon):
  383. RunCommand(('cmp', target, recon))
  384. #end
  385. def EncodeSize(self, output):
  386. return os.stat(output).st_size
  387. #end
  388. #end
  389. class Xdelta1RunClass:
  390. def __str__(self):
  391. return 'xdelta1'
  392. #end
  393. def New(self):
  394. return Xdelta1Runner()
  395. #end
  396. #end
  397. class Xdelta1Runner:
  398. def Encode(self, target, source, output):
  399. assert source != None
  400. args = ['xdelta1', 'delta', '-q', source, target, output]
  401. RunCommand(args, [0, 1])
  402. #end
  403. def Decode(self, input, source, output):
  404. assert source != None
  405. args = ['xdelta1', 'patch', '-q', input, source, output]
  406. # Note: for dumb historical reasons, xdelta1 returns 1 or 0
  407. RunCommand(args)
  408. #end
  409. def Verify(self, target, recon):
  410. RunCommand(('cmp', target, recon))
  411. #end
  412. def EncodeSize(self, output):
  413. return os.stat(output).st_size
  414. #end
  415. #end
  416. # exceptions
  417. class SkipRcsException:
  418. def __init__(self,reason):
  419. self.reason = reason
  420. #end
  421. #end
  422. class NotEnoughVersions:
  423. def __init__(self):
  424. pass
  425. #end
  426. #end
  427. class CommandError:
  428. def __init__(self,cmd,str):
  429. if type(cmd) is types.TupleType or \
  430. type(cmd) is types.ListType:
  431. cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd)
  432. #end
  433. print 'command was: ',cmd
  434. print 'command failed: ',str
  435. print 'have fun debugging'
  436. #end
  437. #end
  438. class RcsVersion:
  439. def __init__(self,vstr):
  440. self.vstr = vstr
  441. #end
  442. def __cmp__(self,other):
  443. return cmp(self.date, other.date)
  444. #end
  445. def __str__(self):
  446. return str(self.vstr)
  447. #end
  448. #end
  449. class RcsFile:
  450. def __init__(self, fname):
  451. self.fname = fname
  452. self.versions = []
  453. self.state = HEAD_STATE
  454. #end
  455. def SetTotRev(self,s):
  456. self.totrev = int(s)
  457. #end
  458. def Rev(self,s):
  459. self.rev = RcsVersion(s)
  460. if len(self.versions) >= self.totrev:
  461. raise SkipRcsException('too many versions (in log messages)')
  462. #end
  463. self.versions.append(self.rev)
  464. #end
  465. def Date(self,s):
  466. self.rev.date = s
  467. #end
  468. def Match(self, line, state, rx, gp, newstate, f):
  469. if state == self.state:
  470. m = rx.match(line)
  471. if m:
  472. if f:
  473. f(m.group(gp))
  474. #end
  475. self.state = newstate
  476. return 1
  477. #end
  478. #end
  479. return None
  480. #end
  481. def Sum1Rlog(self):
  482. f = os.popen('rlog '+self.fname, "r")
  483. l = f.readline()
  484. while l:
  485. if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev):
  486. pass
  487. elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None):
  488. pass
  489. elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev):
  490. pass
  491. elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date):
  492. pass
  493. #end
  494. l = f.readline()
  495. #end
  496. c = f.close()
  497. if c != None:
  498. raise c
  499. #end
  500. #end
  501. def Sum1(self):
  502. st = os.stat(self.fname)
  503. self.rcssize = st.st_size
  504. self.Sum1Rlog()
  505. if self.totrev != len(self.versions):
  506. raise SkipRcsException('wrong version count')
  507. #end
  508. self.versions.sort()
  509. #end
  510. def Checkout(self,n):
  511. v = self.versions[n]
  512. out = open(self.Verf(n), "w")
  513. cmd = 'co -ko -p%s %s' % (v.vstr, self.fname)
  514. total = 0
  515. (inf,
  516. stream,
  517. err) = os.popen3(cmd, "r")
  518. inf.close()
  519. buf = stream.read()
  520. while buf:
  521. total = total + len(buf)
  522. out.write(buf)
  523. buf = stream.read()
  524. #end
  525. v.vsize = total
  526. estr = ''
  527. buf = err.read()
  528. while buf:
  529. estr = estr + buf
  530. buf = err.read()
  531. #end
  532. if stream.close():
  533. raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr))
  534. #end
  535. out.close()
  536. err.close()
  537. #end
  538. def Vdate(self,n):
  539. return self.versions[n].date
  540. #end
  541. def Vstr(self,n):
  542. return self.versions[n].vstr
  543. #end
  544. def Verf(self,n):
  545. return os.path.join(TMPDIR, 'input.%d' % n)
  546. #end
  547. def FilePairsByDate(self, runclass):
  548. if self.totrev < 2:
  549. raise NotEnoughVersions()
  550. #end
  551. self.Checkout(0)
  552. ntrials = []
  553. if self.totrev < 2:
  554. return vtrials
  555. #end
  556. for v in range(0,self.totrev-1):
  557. if v > 1:
  558. os.remove(self.Verf(v-1))
  559. #end
  560. self.Checkout(v+1)
  561. if os.stat(self.Verf(v)).st_size < MIN_SIZE or \
  562. os.stat(self.Verf(v+1)).st_size < MIN_SIZE:
  563. continue
  564. #end
  565. result = TimedTest(self.Verf(v+1),
  566. self.Verf(v),
  567. runclass.New())
  568. target_size = os.stat(self.Verf(v+1)).st_size
  569. ntrials.append(result)
  570. #end
  571. os.remove(self.Verf(self.totrev-1))
  572. os.remove(self.Verf(self.totrev-2))
  573. return ntrials
  574. #end
  575. def AppendVersion(self, f, n):
  576. self.Checkout(n)
  577. rf = open(self.Verf(n), "r")
  578. data = rf.read()
  579. f.write(data)
  580. rf.close()
  581. return len(data)
  582. #end
  583. class RcsFinder:
  584. def __init__(self):
  585. self.subdirs = []
  586. self.rcsfiles = []
  587. self.others = []
  588. self.skipped = []
  589. self.biground = 0
  590. #end
  591. def Scan1(self,dir):
  592. dents = os.listdir(dir)
  593. subdirs = []
  594. rcsfiles = []
  595. others = []
  596. for dent in dents:
  597. full = os.path.join(dir, dent)
  598. if os.path.isdir(full):
  599. subdirs.append(full)
  600. elif dent[len(dent)-2:] == ",v":
  601. rcsfiles.append(RcsFile(full))
  602. else:
  603. others.append(full)
  604. #end
  605. #end
  606. self.subdirs = self.subdirs + subdirs
  607. self.rcsfiles = self.rcsfiles + rcsfiles
  608. self.others = self.others + others
  609. return subdirs
  610. #end
  611. def Crawl(self, dir):
  612. subdirs = [dir]
  613. while subdirs:
  614. s1 = self.Scan1(subdirs[0])
  615. subdirs = subdirs[1:] + s1
  616. #end
  617. #end
  618. def Summarize(self):
  619. good = []
  620. for rf in self.rcsfiles:
  621. try:
  622. rf.Sum1()
  623. if rf.totrev < 2:
  624. raise SkipRcsException('too few versions (< 2)')
  625. #end
  626. except SkipRcsException, e:
  627. #print 'skipping file %s: %s' % (rf.fname, e.reason)
  628. self.skipped.append(rf)
  629. else:
  630. good.append(rf)
  631. #end
  632. self.rcsfiles = good
  633. #end
  634. def AllPairsByDate(self, runclass):
  635. results = []
  636. good = []
  637. for rf in self.rcsfiles:
  638. try:
  639. results = results + rf.FilePairsByDate(runclass)
  640. except SkipRcsException:
  641. print 'file %s has compressed versions: skipping' % (rf.fname)
  642. except NotEnoughVersions:
  643. print 'testing %s on %s: not enough versions' % (runclass, rf.fname)
  644. else:
  645. good.append(rf)
  646. #end
  647. self.rcsfiles = good
  648. self.ReportPairs(runclass, results)
  649. return results
  650. #end
  651. def ReportPairs(self, name, results):
  652. encode_time = 0
  653. decode_time = 0
  654. encode_size = 0
  655. for r in results:
  656. encode_time += r.encode_time.mean
  657. decode_time += r.decode_time.mean
  658. encode_size += r.encode_size
  659. #end
  660. print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \
  661. (name, encode_time, decode_time, encode_size)
  662. #end
  663. def MakeBigFiles(self, rand):
  664. f1 = open(TMPDIR + "/big.1", "w")
  665. f2 = open(TMPDIR + "/big.2", "w")
  666. population = []
  667. for file in self.rcsfiles:
  668. if len(file.versions) < 2:
  669. continue
  670. population.append(file)
  671. #end
  672. f1sz = 0
  673. f2sz = 0
  674. fcount = int(len(population) * FILE_P)
  675. assert fcount > 0
  676. for file in rand.sample(population, fcount):
  677. m = IGNORE_FILENAME.match(file.fname)
  678. if m != None:
  679. continue
  680. #end
  681. r1, r2 = rand.sample(xrange(0, len(file.versions)), 2)
  682. f1sz += file.AppendVersion(f1, r1)
  683. f2sz += file.AppendVersion(f2, r2)
  684. #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):],
  685. #file.Vstr(r1), file.Vstr(r2)))
  686. #end
  687. testkey = 'rcs%d' % self.biground
  688. self.biground = self.biground + 1
  689. print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz)
  690. f1.close()
  691. f2.close()
  692. return (TMPDIR + "/big.1",
  693. TMPDIR + "/big.2",
  694. testkey)
  695. #end
  696. def Generator(self):
  697. return lambda rand: self.MakeBigFiles(rand)
  698. #end
  699. #end
  700. # find a set of RCS files for testing
  701. def GetTestRcsFiles():
  702. rcsf = RcsFinder()
  703. rcsf.Crawl(RCSDIR)
  704. if len(rcsf.rcsfiles) == 0:
  705. raise CommandError('', 'no RCS files')
  706. #end
  707. rcsf.Summarize()
  708. print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % (
  709. len(rcsf.rcsfiles),
  710. len(rcsf.subdirs),
  711. len(rcsf.others),
  712. len(rcsf.skipped))
  713. print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str
  714. print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str
  715. return rcsf
  716. #end
  717. class SampleDataTest:
  718. def __init__(self, dirs):
  719. dirs_in = dirs
  720. self.pairs = []
  721. while dirs:
  722. d = dirs[0]
  723. dirs = dirs[1:]
  724. l = os.listdir(d)
  725. files = []
  726. for e in l:
  727. p = os.path.join(d, e)
  728. if os.path.isdir(p):
  729. dirs.append(p)
  730. else:
  731. files.append(p)
  732. #end
  733. #end
  734. if len(files) > 1:
  735. files.sort()
  736. for x in xrange(len(files)):
  737. for y in xrange(len(files)):
  738. self.pairs.append((files[x], files[y],
  739. '%s-%s' % (files[x], files[y])))
  740. #end
  741. #end
  742. #end
  743. #end
  744. print "Sample data test using %d file pairs in %s" % (
  745. len(self.pairs), dirs_in)
  746. #end
  747. def Generator(self):
  748. return lambda rand: rand.choice(self.pairs)
  749. #end
  750. #end
  751. # configs are represented as a list of values,
  752. # program takes a list of strings:
  753. def ConfigToArgs(config):
  754. args = [ '-C',
  755. ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])]
  756. for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)):
  757. key = CONFIG_ARGMAP[CONFIG_ORDER[i]]
  758. val = config[i]
  759. if val == 'true' or val == 'false':
  760. if val == 'true':
  761. args.append('%s' % key)
  762. #end
  763. else:
  764. args.append('%s=%s' % (key, val))
  765. #end
  766. #end
  767. return args
  768. #end
  769. #
  770. class RandomTest:
  771. def __init__(self, tnum, tinput, config, syntuple = None):
  772. self.mytinput = tinput[2]
  773. self.myconfig = config
  774. self.tnum = tnum
  775. if syntuple != None:
  776. self.runtime = syntuple[0]
  777. self.compsize = syntuple[1]
  778. self.decodetime = None
  779. else:
  780. args = ConfigToArgs(config)
  781. result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args))
  782. self.runtime = result.encode_time.mean
  783. self.compsize = result.encode_size
  784. self.decodetime = result.decode_time.mean
  785. #end
  786. self.score = None
  787. self.time_pos = None
  788. self.size_pos = None
  789. self.score_pos = None
  790. #end
  791. def __str__(self):
  792. decodestr = ' %s' % self.decodetime
  793. return 'time %.6f%s size %d%s << %s >>%s' % (
  794. self.time(), ((self.time_pos != None) and
  795. (" (%s)" % self.time_pos) or ""),
  796. self.size(), ((self.size_pos != None) and
  797. (" (%s)" % self.size_pos) or ""),
  798. c2str(self.config()),
  799. decodestr)
  800. #end
  801. def time(self):
  802. return self.runtime
  803. #end
  804. def size(self):
  805. return self.compsize
  806. #end
  807. def config(self):
  808. return self.myconfig
  809. #end
  810. def score(self):
  811. return self.score
  812. #end
  813. def tinput(self):
  814. return self.mytinput
  815. #end
  816. #end
  817. def PosInAlist(l, e):
  818. for i in range(0, len(l)):
  819. if l[i][1] == e:
  820. return i;
  821. #end
  822. #end
  823. return -1
  824. #end
  825. # Generates a set of num_results test configurations, given the list of
  826. # retest-configs.
  827. def RandomTestConfigs(rand, input_configs, num_results):
  828. outputs = input_configs[:]
  829. have_set = dict([(c,c) for c in input_configs])
  830. # Compute a random configuration
  831. def RandomConfig():
  832. config = []
  833. cmap = {}
  834. for key in CONFIG_ORDER:
  835. val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap)
  836. config.append(val)
  837. #end
  838. return tuple(config)
  839. #end
  840. while len(outputs) < num_results:
  841. newc = None
  842. for i in xrange(100):
  843. c = RandomConfig()
  844. if have_set.has_key(c):
  845. continue
  846. #end
  847. have_set[c] = c
  848. newc = c
  849. break
  850. if newc is None:
  851. print 'stopped looking for configs at %d' % len(outputs)
  852. break
  853. #end
  854. outputs.append(c)
  855. #end
  856. outputs.sort()
  857. return outputs
  858. #end
  859. def RunOptimizationLoop(rand, generator, rounds):
  860. configs = []
  861. for rnum in xrange(rounds):
  862. configs = RandomTestConfigs(rand, configs, MAX_RESULTS)
  863. tinput = generator(rand)
  864. tests = []
  865. for x in xrange(len(configs)):
  866. t = RandomTest(x, tinput, configs[x])
  867. print 'Round %d test %d: %s' % (rnum, x, t)
  868. tests.append(t)
  869. #end
  870. results = ScoreTests(tests)
  871. for r in results:
  872. c = r.config()
  873. if not test_all_config_results.has_key(c):
  874. test_all_config_results[c] = [r]
  875. else:
  876. test_all_config_results[c].append(r)
  877. #end
  878. #end
  879. #GraphResults('expt%d' % rnum, results)
  880. #GraphSummary('sum%d' % rnum, results)
  881. # re-test some fraction
  882. configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]]
  883. #end
  884. #end
  885. # TODO: cleanup
  886. test_all_config_results = {}
  887. def ScoreTests(results):
  888. scored = []
  889. timed = []
  890. sized = []
  891. t_min = float(min([test.time() for test in results]))
  892. #t_max = float(max([test.time() for test in results]))
  893. s_min = float(min([test.size() for test in results]))
  894. #s_max = float(max([test.size() for test in results]))
  895. for test in results:
  896. # Hyperbolic function. Smaller scores still better
  897. red = 0.999 # minimum factors for each dimension are 1/1000
  898. test.score = ((test.size() - s_min * red) *
  899. (test.time() - t_min * red))
  900. scored.append((test.score, test))
  901. timed.append((test.time(), test))
  902. sized.append((test.size(), test))
  903. #end
  904. scored.sort()
  905. timed.sort()
  906. sized.sort()
  907. best_by_size = []
  908. best_by_time = []
  909. pos = 0
  910. for (score, test) in scored:
  911. pos += 1
  912. test.score_pos = pos
  913. #end
  914. scored = [x[1] for x in scored]
  915. for test in scored:
  916. test.size_pos = PosInAlist(sized, test)
  917. test.time_pos = PosInAlist(timed, test)
  918. #end
  919. for test in scored:
  920. c = test.config()
  921. s = 0.0
  922. print 'H-Score: %0.9f %s' % (test.score, test)
  923. #end
  924. return scored
  925. #end
  926. def GraphResults(desc, results):
  927. f = open("data-%s.csv" % desc, "w")
  928. for r in results:
  929. f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r))
  930. #end
  931. f.close()
  932. os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc))
  933. #end
  934. def GraphSummary(desc, results_ignore):
  935. test_population = 0
  936. config_ordered = []
  937. # drops duplicate test/config pairs (TODO: don't retest them)
  938. for config, cresults in test_all_config_results.items():
  939. input_config_map = {}
  940. uniq = []
  941. for test in cresults:
  942. assert test.config() == config
  943. test_population += 1
  944. key = test.tinput()
  945. if not input_config_map.has_key(key):
  946. input_config_map[key] = {}
  947. #end
  948. if input_config_map[key].has_key(config):
  949. print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test)
  950. continue
  951. #end
  952. input_config_map[key][config] = test
  953. uniq.append(test)
  954. #end
  955. config_ordered.append(uniq)
  956. #end
  957. # sort configs descending by number of tests
  958. config_ordered.sort(lambda x, y: len(y) - len(x))
  959. print 'population %d: %d configs %d results' % \
  960. (test_population,
  961. len(config_ordered),
  962. len(config_ordered[0]))
  963. if config_ordered[0] == 1:
  964. return
  965. #end
  966. # a map from test-key to test-list w/ various configs
  967. input_set = {}
  968. osize = len(config_ordered)
  969. for i in xrange(len(config_ordered)):
  970. config = config_ordered[i][0].config()
  971. config_tests = config_ordered[i]
  972. #print '%s has %d tested inputs' % (config, len(config_tests))
  973. if len(input_set) == 0:
  974. input_set = dict([(t.tinput(), [t]) for t in config_tests])
  975. continue
  976. #end
  977. # a map from test-key to test-list w/ various configs
  978. update_set = {}
  979. for r in config_tests:
  980. t = r.tinput()
  981. if input_set.has_key(t):
  982. update_set[t] = input_set[t] + [r]
  983. else:
  984. #print 'config %s does not have test %s' % (config, t)
  985. pass
  986. #end
  987. #end
  988. if len(update_set) <= 1:
  989. break
  990. #end
  991. input_set = update_set
  992. # continue if there are more w/ the same number of inputs
  993. if i < (len(config_ordered) - 1) and \
  994. len(config_ordered[i + 1]) == len(config_tests):
  995. continue
  996. #end
  997. # synthesize results for multi-test inputs
  998. config_num = None
  999. # map of config to sum(various test-keys)
  1000. smap = {}
  1001. for (key, tests) in input_set.items():
  1002. if config_num == None:
  1003. # config_num should be the same in all elements
  1004. config_num = len(tests)
  1005. smap = dict([(r.config(),
  1006. (r.time(),
  1007. r.size()))
  1008. for r in tests])
  1009. else:
  1010. # compuate the per-config sum of time/size
  1011. assert config_num == len(tests)
  1012. smap = dict([(r.config(),
  1013. (smap[r.config()][0] + r.time(),
  1014. smap[r.config()][1] + r.size()))
  1015. for r in tests])
  1016. #end
  1017. #end
  1018. if config_num == 1:
  1019. continue
  1020. #end
  1021. if len(input_set) == osize:
  1022. break
  1023. #end
  1024. summary = '%s-%d' % (desc, len(input_set))
  1025. osize = len(input_set)
  1026. print 'generate %s w/ %d configs' % (summary, config_num)
  1027. syn = [RandomTest(0, (None, None, summary), config,
  1028. syntuple = (smap[config][0], smap[config][1]))
  1029. for config in smap.keys()]
  1030. syn = ScoreTests(syn)
  1031. #print 'smap is %s' % (smap,)
  1032. #print 'syn is %s' % (' and '.join([str(x) for x in syn]))
  1033. #GraphResults(summary, syn)
  1034. #end
  1035. #end
  1036. def RunRegressionTest(pairs, rounds):
  1037. for args in [
  1038. [],
  1039. ['-S=djw'],
  1040. ['-B=412907520'],
  1041. ['-B 412907520', ],
  1042. ]:
  1043. print "Args %s" % (args)
  1044. for (file1, file2, testkey) in pairs:
  1045. ttest = TimedTest(file1, file2, Xdelta3Runner(args, forkexec=True),
  1046. skip_trials = 0,
  1047. min_trials = 1,
  1048. max_trials = 1)
  1049. print "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n" % (
  1050. file1, file2,
  1051. ttest.encode_time.str,
  1052. ttest.decode_time.str,
  1053. ttest.encode_size)
  1054. #end
  1055. #end
  1056. if __name__ == "__main__":
  1057. try:
  1058. RunCommand(['rm', '-rf', TMPDIR])
  1059. os.mkdir(TMPDIR)
  1060. #rcsf = GetTestRcsFiles()
  1061. #generator = rcsf.Generator()
  1062. sample = SampleDataTest([SAMPLEDIR])
  1063. generator = sample.Generator()
  1064. rand = random.Random(135135135135135)
  1065. RunRegressionTest(sample.pairs, TEST_ROUNDS)
  1066. #RunSpeedTest()
  1067. # the idea below is to add the default configurations and
  1068. # xdelta1 to the optimization loop:
  1069. #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6']))
  1070. #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9']))
  1071. #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw']))
  1072. #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw']))
  1073. #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T']))
  1074. #x1r = rcsf.AllPairsByDate(Xdelta1RunClass())
  1075. except CommandError:
  1076. pass
  1077. else:
  1078. RunCommand(['rm', '-rf', TMPDIR])
  1079. pass
  1080. #end
  1081. #end