utils.py 212 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305
  1. import asyncio
  2. import atexit
  3. import base64
  4. import binascii
  5. import calendar
  6. import codecs
  7. import collections
  8. import collections.abc
  9. import contextlib
  10. import datetime
  11. import email.header
  12. import email.utils
  13. import errno
  14. import gzip
  15. import hashlib
  16. import hmac
  17. import html.entities
  18. import html.parser
  19. import http.client
  20. import http.cookiejar
  21. import importlib.util
  22. import inspect
  23. import io
  24. import itertools
  25. import json
  26. import locale
  27. import math
  28. import mimetypes
  29. import operator
  30. import os
  31. import platform
  32. import random
  33. import re
  34. import shlex
  35. import socket
  36. import ssl
  37. import struct
  38. import subprocess
  39. import sys
  40. import tempfile
  41. import time
  42. import traceback
  43. import types
  44. import unicodedata
  45. import urllib.error
  46. import urllib.parse
  47. import urllib.request
  48. import xml.etree.ElementTree
  49. import zlib
  50. from .compat import functools # isort: split
  51. from .compat import (
  52. compat_etree_fromstring,
  53. compat_expanduser,
  54. compat_HTMLParseError,
  55. compat_os_name,
  56. compat_shlex_quote,
  57. )
  58. from .dependencies import brotli, certifi, websockets, xattr
  59. from .socks import ProxyType, sockssocket
  60. def register_socks_protocols():
  61. # "Register" SOCKS protocols
  62. # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  63. # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  64. for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  65. if scheme not in urllib.parse.uses_netloc:
  66. urllib.parse.uses_netloc.append(scheme)
  67. # This is not clearly defined otherwise
  68. compiled_regex_type = type(re.compile(''))
  69. def random_user_agent():
  70. _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  71. _CHROME_VERSIONS = (
  72. '90.0.4430.212',
  73. '90.0.4430.24',
  74. '90.0.4430.70',
  75. '90.0.4430.72',
  76. '90.0.4430.85',
  77. '90.0.4430.93',
  78. '91.0.4472.101',
  79. '91.0.4472.106',
  80. '91.0.4472.114',
  81. '91.0.4472.124',
  82. '91.0.4472.164',
  83. '91.0.4472.19',
  84. '91.0.4472.77',
  85. '92.0.4515.107',
  86. '92.0.4515.115',
  87. '92.0.4515.131',
  88. '92.0.4515.159',
  89. '92.0.4515.43',
  90. '93.0.4556.0',
  91. '93.0.4577.15',
  92. '93.0.4577.63',
  93. '93.0.4577.82',
  94. '94.0.4606.41',
  95. '94.0.4606.54',
  96. '94.0.4606.61',
  97. '94.0.4606.71',
  98. '94.0.4606.81',
  99. '94.0.4606.85',
  100. '95.0.4638.17',
  101. '95.0.4638.50',
  102. '95.0.4638.54',
  103. '95.0.4638.69',
  104. '95.0.4638.74',
  105. '96.0.4664.18',
  106. '96.0.4664.45',
  107. '96.0.4664.55',
  108. '96.0.4664.93',
  109. '97.0.4692.20',
  110. )
  111. return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
  112. SUPPORTED_ENCODINGS = [
  113. 'gzip', 'deflate'
  114. ]
  115. if brotli:
  116. SUPPORTED_ENCODINGS.append('br')
  117. std_headers = {
  118. 'User-Agent': random_user_agent(),
  119. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  120. 'Accept-Language': 'en-us,en;q=0.5',
  121. 'Sec-Fetch-Mode': 'navigate',
  122. }
  123. USER_AGENTS = {
  124. 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
  125. }
  126. NO_DEFAULT = object()
  127. IDENTITY = lambda x: x
  128. ENGLISH_MONTH_NAMES = [
  129. 'January', 'February', 'March', 'April', 'May', 'June',
  130. 'July', 'August', 'September', 'October', 'November', 'December']
  131. MONTH_NAMES = {
  132. 'en': ENGLISH_MONTH_NAMES,
  133. 'fr': [
  134. 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
  135. 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  136. # these follow the genitive grammatical case (dopełniacz)
  137. # some websites might be using nominative, which will require another month list
  138. # https://en.wikibooks.org/wiki/Polish/Noun_cases
  139. 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
  140. 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
  141. }
  142. # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
  143. TIMEZONE_NAMES = {
  144. 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
  145. 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
  146. 'EST': -5, 'EDT': -4, # Eastern
  147. 'CST': -6, 'CDT': -5, # Central
  148. 'MST': -7, 'MDT': -6, # Mountain
  149. 'PST': -8, 'PDT': -7 # Pacific
  150. }
  151. # needed for sanitizing filenames in restricted mode
  152. ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
  153. itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
  154. 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
  155. DATE_FORMATS = (
  156. '%d %B %Y',
  157. '%d %b %Y',
  158. '%B %d %Y',
  159. '%B %dst %Y',
  160. '%B %dnd %Y',
  161. '%B %drd %Y',
  162. '%B %dth %Y',
  163. '%b %d %Y',
  164. '%b %dst %Y',
  165. '%b %dnd %Y',
  166. '%b %drd %Y',
  167. '%b %dth %Y',
  168. '%b %dst %Y %I:%M',
  169. '%b %dnd %Y %I:%M',
  170. '%b %drd %Y %I:%M',
  171. '%b %dth %Y %I:%M',
  172. '%Y %m %d',
  173. '%Y-%m-%d',
  174. '%Y.%m.%d.',
  175. '%Y/%m/%d',
  176. '%Y/%m/%d %H:%M',
  177. '%Y/%m/%d %H:%M:%S',
  178. '%Y%m%d%H%M',
  179. '%Y%m%d%H%M%S',
  180. '%Y%m%d',
  181. '%Y-%m-%d %H:%M',
  182. '%Y-%m-%d %H:%M:%S',
  183. '%Y-%m-%d %H:%M:%S.%f',
  184. '%Y-%m-%d %H:%M:%S:%f',
  185. '%d.%m.%Y %H:%M',
  186. '%d.%m.%Y %H.%M',
  187. '%Y-%m-%dT%H:%M:%SZ',
  188. '%Y-%m-%dT%H:%M:%S.%fZ',
  189. '%Y-%m-%dT%H:%M:%S.%f0Z',
  190. '%Y-%m-%dT%H:%M:%S',
  191. '%Y-%m-%dT%H:%M:%S.%f',
  192. '%Y-%m-%dT%H:%M',
  193. '%b %d %Y at %H:%M',
  194. '%b %d %Y at %H:%M:%S',
  195. '%B %d %Y at %H:%M',
  196. '%B %d %Y at %H:%M:%S',
  197. '%H:%M %d-%b-%Y',
  198. )
  199. DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
  200. DATE_FORMATS_DAY_FIRST.extend([
  201. '%d-%m-%Y',
  202. '%d.%m.%Y',
  203. '%d.%m.%y',
  204. '%d/%m/%Y',
  205. '%d/%m/%y',
  206. '%d/%m/%Y %H:%M:%S',
  207. '%d-%m-%Y %H:%M',
  208. ])
  209. DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
  210. DATE_FORMATS_MONTH_FIRST.extend([
  211. '%m-%d-%Y',
  212. '%m.%d.%Y',
  213. '%m/%d/%Y',
  214. '%m/%d/%y',
  215. '%m/%d/%Y %H:%M:%S',
  216. ])
  217. PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
  218. JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
  219. NUMBER_RE = r'\d+(?:\.\d+)?'
  220. @functools.cache
  221. def preferredencoding():
  222. """Get preferred encoding.
  223. Returns the best encoding scheme for the system, based on
  224. locale.getpreferredencoding() and some further tweaks.
  225. """
  226. try:
  227. pref = locale.getpreferredencoding()
  228. 'TEST'.encode(pref)
  229. except Exception:
  230. pref = 'UTF-8'
  231. return pref
  232. def write_json_file(obj, fn):
  233. """ Encode obj as JSON and write it to fn, atomically if possible """
  234. tf = tempfile.NamedTemporaryFile(
  235. prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
  236. suffix='.tmp', delete=False, mode='w', encoding='utf-8')
  237. try:
  238. with tf:
  239. json.dump(obj, tf, ensure_ascii=False)
  240. if sys.platform == 'win32':
  241. # Need to remove existing file on Windows, else os.rename raises
  242. # WindowsError or FileExistsError.
  243. with contextlib.suppress(OSError):
  244. os.unlink(fn)
  245. with contextlib.suppress(OSError):
  246. mask = os.umask(0)
  247. os.umask(mask)
  248. os.chmod(tf.name, 0o666 & ~mask)
  249. os.rename(tf.name, fn)
  250. except Exception:
  251. with contextlib.suppress(OSError):
  252. os.remove(tf.name)
  253. raise
  254. def find_xpath_attr(node, xpath, key, val=None):
  255. """ Find the xpath xpath[@key=val] """
  256. assert re.match(r'^[a-zA-Z_-]+$', key)
  257. expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
  258. return node.find(expr)
  259. # On python2.6 the xml.etree.ElementTree.Element methods don't support
  260. # the namespace parameter
  261. def xpath_with_ns(path, ns_map):
  262. components = [c.split(':') for c in path.split('/')]
  263. replaced = []
  264. for c in components:
  265. if len(c) == 1:
  266. replaced.append(c[0])
  267. else:
  268. ns, tag = c
  269. replaced.append('{%s}%s' % (ns_map[ns], tag))
  270. return '/'.join(replaced)
  271. def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  272. def _find_xpath(xpath):
  273. return node.find(xpath)
  274. if isinstance(xpath, str):
  275. n = _find_xpath(xpath)
  276. else:
  277. for xp in xpath:
  278. n = _find_xpath(xp)
  279. if n is not None:
  280. break
  281. if n is None:
  282. if default is not NO_DEFAULT:
  283. return default
  284. elif fatal:
  285. name = xpath if name is None else name
  286. raise ExtractorError('Could not find XML element %s' % name)
  287. else:
  288. return None
  289. return n
  290. def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  291. n = xpath_element(node, xpath, name, fatal=fatal, default=default)
  292. if n is None or n == default:
  293. return n
  294. if n.text is None:
  295. if default is not NO_DEFAULT:
  296. return default
  297. elif fatal:
  298. name = xpath if name is None else name
  299. raise ExtractorError('Could not find XML element\'s text %s' % name)
  300. else:
  301. return None
  302. return n.text
  303. def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
  304. n = find_xpath_attr(node, xpath, key)
  305. if n is None:
  306. if default is not NO_DEFAULT:
  307. return default
  308. elif fatal:
  309. name = f'{xpath}[@{key}]' if name is None else name
  310. raise ExtractorError('Could not find XML attribute %s' % name)
  311. else:
  312. return None
  313. return n.attrib[key]
  314. def get_element_by_id(id, html, **kwargs):
  315. """Return the content of the tag with the specified ID in the passed HTML document"""
  316. return get_element_by_attribute('id', id, html, **kwargs)
  317. def get_element_html_by_id(id, html, **kwargs):
  318. """Return the html of the tag with the specified ID in the passed HTML document"""
  319. return get_element_html_by_attribute('id', id, html, **kwargs)
  320. def get_element_by_class(class_name, html):
  321. """Return the content of the first tag with the specified class in the passed HTML document"""
  322. retval = get_elements_by_class(class_name, html)
  323. return retval[0] if retval else None
  324. def get_element_html_by_class(class_name, html):
  325. """Return the html of the first tag with the specified class in the passed HTML document"""
  326. retval = get_elements_html_by_class(class_name, html)
  327. return retval[0] if retval else None
  328. def get_element_by_attribute(attribute, value, html, **kwargs):
  329. retval = get_elements_by_attribute(attribute, value, html, **kwargs)
  330. return retval[0] if retval else None
  331. def get_element_html_by_attribute(attribute, value, html, **kargs):
  332. retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
  333. return retval[0] if retval else None
  334. def get_elements_by_class(class_name, html, **kargs):
  335. """Return the content of all tags with the specified class in the passed HTML document as a list"""
  336. return get_elements_by_attribute(
  337. 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
  338. html, escape_value=False)
  339. def get_elements_html_by_class(class_name, html):
  340. """Return the html of all tags with the specified class in the passed HTML document as a list"""
  341. return get_elements_html_by_attribute(
  342. 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
  343. html, escape_value=False)
  344. def get_elements_by_attribute(*args, **kwargs):
  345. """Return the content of the tag with the specified attribute in the passed HTML document"""
  346. return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  347. def get_elements_html_by_attribute(*args, **kwargs):
  348. """Return the html of the tag with the specified attribute in the passed HTML document"""
  349. return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  350. def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
  351. """
  352. Return the text (content) and the html (whole) of the tag with the specified
  353. attribute in the passed HTML document
  354. """
  355. if not value:
  356. return
  357. quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
  358. value = re.escape(value) if escape_value else value
  359. partial_element_re = rf'''(?x)
  360. <(?P<tag>{tag})
  361. (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
  362. \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
  363. '''
  364. for m in re.finditer(partial_element_re, html):
  365. content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
  366. yield (
  367. unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
  368. whole
  369. )
  370. class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
  371. """
  372. HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
  373. closing tag for the first opening tag it has encountered, and can be used
  374. as a context manager
  375. """
  376. class HTMLBreakOnClosingTagException(Exception):
  377. pass
  378. def __init__(self):
  379. self.tagstack = collections.deque()
  380. html.parser.HTMLParser.__init__(self)
  381. def __enter__(self):
  382. return self
  383. def __exit__(self, *_):
  384. self.close()
  385. def close(self):
  386. # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
  387. # so data remains buffered; we no longer have any interest in it, thus
  388. # override this method to discard it
  389. pass
  390. def handle_starttag(self, tag, _):
  391. self.tagstack.append(tag)
  392. def handle_endtag(self, tag):
  393. if not self.tagstack:
  394. raise compat_HTMLParseError('no tags in the stack')
  395. while self.tagstack:
  396. inner_tag = self.tagstack.pop()
  397. if inner_tag == tag:
  398. break
  399. else:
  400. raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
  401. if not self.tagstack:
  402. raise self.HTMLBreakOnClosingTagException()
  403. # XXX: This should be far less strict
  404. def get_element_text_and_html_by_tag(tag, html):
  405. """
  406. For the first element with the specified tag in the passed HTML document
  407. return its' content (text) and the whole element (html)
  408. """
  409. def find_or_raise(haystack, needle, exc):
  410. try:
  411. return haystack.index(needle)
  412. except ValueError:
  413. raise exc
  414. closing_tag = f'</{tag}>'
  415. whole_start = find_or_raise(
  416. html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
  417. content_start = find_or_raise(
  418. html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
  419. content_start += whole_start + 1
  420. with HTMLBreakOnClosingTagParser() as parser:
  421. parser.feed(html[whole_start:content_start])
  422. if not parser.tagstack or parser.tagstack[0] != tag:
  423. raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
  424. offset = content_start
  425. while offset < len(html):
  426. next_closing_tag_start = find_or_raise(
  427. html[offset:], closing_tag,
  428. compat_HTMLParseError(f'closing {tag} tag not found'))
  429. next_closing_tag_end = next_closing_tag_start + len(closing_tag)
  430. try:
  431. parser.feed(html[offset:offset + next_closing_tag_end])
  432. offset += next_closing_tag_end
  433. except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
  434. return html[content_start:offset + next_closing_tag_start], \
  435. html[whole_start:offset + next_closing_tag_end]
  436. raise compat_HTMLParseError('unexpected end of html')
  437. class HTMLAttributeParser(html.parser.HTMLParser):
  438. """Trivial HTML parser to gather the attributes for a single element"""
  439. def __init__(self):
  440. self.attrs = {}
  441. html.parser.HTMLParser.__init__(self)
  442. def handle_starttag(self, tag, attrs):
  443. self.attrs = dict(attrs)
  444. raise compat_HTMLParseError('done')
  445. class HTMLListAttrsParser(html.parser.HTMLParser):
  446. """HTML parser to gather the attributes for the elements of a list"""
  447. def __init__(self):
  448. html.parser.HTMLParser.__init__(self)
  449. self.items = []
  450. self._level = 0
  451. def handle_starttag(self, tag, attrs):
  452. if tag == 'li' and self._level == 0:
  453. self.items.append(dict(attrs))
  454. self._level += 1
  455. def handle_endtag(self, tag):
  456. self._level -= 1
  457. def extract_attributes(html_element):
  458. """Given a string for an HTML element such as
  459. <el
  460. a="foo" B="bar" c="&98;az" d=boz
  461. empty= noval entity="&amp;"
  462. sq='"' dq="'"
  463. >
  464. Decode and return a dictionary of attributes.
  465. {
  466. 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
  467. 'empty': '', 'noval': None, 'entity': '&',
  468. 'sq': '"', 'dq': '\''
  469. }.
  470. """
  471. parser = HTMLAttributeParser()
  472. with contextlib.suppress(compat_HTMLParseError):
  473. parser.feed(html_element)
  474. parser.close()
  475. return parser.attrs
  476. def parse_list(webpage):
  477. """Given a string for an series of HTML <li> elements,
  478. return a dictionary of their attributes"""
  479. parser = HTMLListAttrsParser()
  480. parser.feed(webpage)
  481. parser.close()
  482. return parser.items
  483. def clean_html(html):
  484. """Clean an HTML snippet into a readable string"""
  485. if html is None: # Convenience for sanitizing descriptions etc.
  486. return html
  487. html = re.sub(r'\s+', ' ', html)
  488. html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
  489. html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
  490. # Strip html tags
  491. html = re.sub('<.*?>', '', html)
  492. # Replace html entities
  493. html = unescapeHTML(html)
  494. return html.strip()
  495. class LenientJSONDecoder(json.JSONDecoder):
  496. def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
  497. self.transform_source, self.ignore_extra = transform_source, ignore_extra
  498. super().__init__(*args, **kwargs)
  499. def decode(self, s):
  500. if self.transform_source:
  501. s = self.transform_source(s)
  502. try:
  503. if self.ignore_extra:
  504. return self.raw_decode(s.lstrip())[0]
  505. return super().decode(s)
  506. except json.JSONDecodeError as e:
  507. if e.pos is not None:
  508. raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
  509. raise
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the name as given, then the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/hypervideo/hypervideo/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported: fall back to a plain open()
                # (presumably LockingUnsupportedError is an OSError subclass - confirm)
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or when access was denied
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying would fail the same way
            if old_filename == filename:
                raise
  543. def timeconvert(timestr):
  544. """Convert RFC 2822 defined time string into system timestamp"""
  545. timestamp = None
  546. timetuple = email.utils.parsedate_tz(timestr)
  547. if timetuple is not None:
  548. timestamp = email.utils.mktime_tz(timetuple)
  549. return timestamp
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, hypervideo's new sanitization rules are in effect
    @returns            The sanitized string; '' only for empty input
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Substitutes are prefixed with '\0' so that the clean-up passes
        # below can tell them apart from characters of the original string
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers themselves; never return an empty name
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim them from both ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading dash with an underscore
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # Strip leading dots
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
  600. def sanitize_path(s, force=False):
  601. """Sanitizes and normalizes path on Windows"""
  602. if sys.platform == 'win32':
  603. force = False
  604. drive_or_unc, _ = os.path.splitdrive(s)
  605. elif force:
  606. drive_or_unc = ''
  607. else:
  608. return s
  609. norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
  610. if drive_or_unc:
  611. norm_path.pop(0)
  612. sanitized_path = [
  613. path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
  614. for path_part in norm_path]
  615. if drive_or_unc:
  616. sanitized_path.insert(0, drive_or_unc + os.path.sep)
  617. elif force and s and s[0] == os.path.sep:
  618. sanitized_path.insert(0, os.path.sep)
  619. return os.path.join(*sanitized_path)
  620. def sanitize_url(url, *, scheme='http'):
  621. # Prepend protocol-less URLs with `http:` scheme in order to mitigate
  622. # the number of unwanted failures due to missing protocol
  623. if url is None:
  624. return
  625. elif url.startswith('//'):
  626. return f'{scheme}:{url}'
  627. # Fix some common typos seen so far
  628. COMMON_TYPOS = (
  629. # https://github.com/ytdl-org/youtube-dl/issues/15649
  630. (r'^httpss://', r'https://'),
  631. # https://bx1.be/lives/direct-tv/
  632. (r'^rmtp([es]?)://', r'rtmp\1://'),
  633. )
  634. for mistake, fixup in COMMON_TYPOS:
  635. if re.match(mistake, url):
  636. return re.sub(mistake, fixup, url)
  637. return url
  638. def extract_basic_auth(url):
  639. parts = urllib.parse.urlsplit(url)
  640. if parts.username is None:
  641. return url, None
  642. url = urllib.parse.urlunsplit(parts._replace(netloc=(
  643. parts.hostname if parts.port is None
  644. else '%s:%d' % (parts.hostname, parts.port))))
  645. auth_payload = base64.b64encode(
  646. ('%s:%s' % (parts.username, parts.password or '')).encode())
  647. return url, f'Basic {auth_payload.decode()}'
  648. def sanitized_Request(url, *args, **kwargs):
  649. url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
  650. if auth_header is not None:
  651. headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
  652. headers['Authorization'] = auth_header
  653. return urllib.request.Request(url, *args, **kwargs)
  654. def expand_path(s):
  655. """Expand shell variables and ~"""
  656. return os.path.expandvars(compat_expanduser(s))
  657. def orderedSet(iterable, *, lazy=False):
  658. """Remove all duplicates from the input iterable"""
  659. def _iter():
  660. seen = [] # Do not use set since the items can be unhashable
  661. for x in iterable:
  662. if x not in seen:
  663. seen.append(x)
  664. yield x
  665. return _iter() if lazy else list(_iter())
  666. def _htmlentity_transform(entity_with_semicolon):
  667. """Transforms an HTML entity to a character."""
  668. entity = entity_with_semicolon[:-1]
  669. # Known non-numeric HTML entity
  670. if entity in html.entities.name2codepoint:
  671. return chr(html.entities.name2codepoint[entity])
  672. # TODO: HTML5 allows entities without a semicolon.
  673. # E.g. '&Eacuteric' should be decoded as 'Éric'.
  674. if entity_with_semicolon in html.entities.html5:
  675. return html.entities.html5[entity_with_semicolon]
  676. mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
  677. if mobj is not None:
  678. numstr = mobj.group(1)
  679. if numstr.startswith('x'):
  680. base = 16
  681. numstr = '0%s' % numstr
  682. else:
  683. base = 10
  684. # See https://github.com/ytdl-org/youtube-dl/issues/7518
  685. with contextlib.suppress(ValueError):
  686. return chr(int(numstr, base))
  687. # Unknown entity in name, return its literal representation
  688. return '&%s;' % entity
  689. def unescapeHTML(s):
  690. if s is None:
  691. return None
  692. assert isinstance(s, str)
  693. return re.sub(
  694. r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  695. def escapeHTML(text):
  696. return (
  697. text
  698. .replace('&', '&amp;')
  699. .replace('<', '&lt;')
  700. .replace('>', '&gt;')
  701. .replace('"', '&quot;')
  702. .replace("'", '&#39;')
  703. )
  704. def process_communicate_or_kill(p, *args, **kwargs):
  705. deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
  706. f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
  707. return Popen.communicate_or_kill(p, *args, **kwargs)
  708. class Popen(subprocess.Popen):
  709. if sys.platform == 'win32':
  710. _startupinfo = subprocess.STARTUPINFO()
  711. _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
  712. else:
  713. _startupinfo = None
  714. @staticmethod
  715. def _fix_pyinstaller_ld_path(env):
  716. """Restore LD_LIBRARY_PATH when using PyInstaller
  717. Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
  718. https://github.com/hypervideo/hypervideo/issues/4573
  719. """
  720. if not hasattr(sys, '_MEIPASS'):
  721. return
  722. def _fix(key):
  723. orig = env.get(f'{key}_ORIG')
  724. if orig is None:
  725. env.pop(key, None)
  726. else:
  727. env[key] = orig
  728. _fix('LD_LIBRARY_PATH') # Linux
  729. _fix('DYLD_LIBRARY_PATH') # macOS
  730. def __init__(self, *args, env=None, text=False, **kwargs):
  731. if env is None:
  732. env = os.environ.copy()
  733. self._fix_pyinstaller_ld_path(env)
  734. if text is True:
  735. kwargs['universal_newlines'] = True # For 3.6 compatibility
  736. kwargs.setdefault('encoding', 'utf-8')
  737. kwargs.setdefault('errors', 'replace')
  738. super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
  739. def communicate_or_kill(self, *args, **kwargs):
  740. try:
  741. return self.communicate(*args, **kwargs)
  742. except BaseException: # Including KeyboardInterrupt
  743. self.kill(timeout=None)
  744. raise
  745. def kill(self, *, timeout=0):
  746. super().kill()
  747. if timeout != 0:
  748. self.wait(timeout=timeout)
  749. @classmethod
  750. def run(cls, *args, timeout=None, **kwargs):
  751. with cls(*args, **kwargs) as proc:
  752. default = '' if proc.text_mode else b''
  753. stdout, stderr = proc.communicate_or_kill(timeout=timeout)
  754. return stdout or default, stderr or default, proc.returncode
  755. def get_subprocess_encoding():
  756. if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
  757. # For subprocess calls, encode with locale encoding
  758. # Refer to http://stackoverflow.com/a/9951851/35070
  759. encoding = preferredencoding()
  760. else:
  761. encoding = sys.getfilesystemencoding()
  762. if encoding is None:
  763. encoding = 'utf-8'
  764. return encoding
  765. def encodeFilename(s, for_subprocess=False):
  766. assert isinstance(s, str)
  767. return s
  768. def decodeFilename(b, for_subprocess=False):
  769. return b
  770. def encodeArgument(s):
  771. # Legacy code that uses byte strings
  772. # Uncomment the following line after fixing all post processors
  773. # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
  774. return s if isinstance(s, str) else s.decode('ascii')
  775. def decodeArgument(b):
  776. return b
  777. def decodeOption(optval):
  778. if optval is None:
  779. return optval
  780. if isinstance(optval, bytes):
  781. optval = optval.decode(preferredencoding())
  782. assert isinstance(optval, str)
  783. return optval
  784. _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
  785. def timetuple_from_msec(msec):
  786. secs, msec = divmod(msec, 1000)
  787. mins, secs = divmod(secs, 60)
  788. hrs, mins = divmod(mins, 60)
  789. return _timetuple(hrs, mins, secs, msec)
  790. def formatSeconds(secs, delim=':', msec=False):
  791. time = timetuple_from_msec(secs * 1000)
  792. if time.hours:
  793. ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
  794. elif time.minutes:
  795. ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
  796. else:
  797. ret = '%d' % time.seconds
  798. return '%s.%03d' % (ret, time.milliseconds) if msec else ret
  799. def _ssl_load_windows_store_certs(ssl_context, storename):
  800. # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
  801. try:
  802. certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
  803. if encoding == 'x509_asn' and (
  804. trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
  805. except PermissionError:
  806. return
  807. for cert in certs:
  808. with contextlib.suppress(ssl.SSLError):
  809. ssl_context.load_verify_locations(cadata=cert)
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler with an SSL context configured from params

    Keys read from params: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'. Extra keyword arguments are passed on to
    YoutubeDLHTTPSHandler.

    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/hypervideo/hypervideo/issues/4627
        # 3. https://github.com/hypervideo/hypervideo/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/hypervideo/hypervideo/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle unless the user opted out of it
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/hypervideo/hypervideo/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/hypervideo/hypervideo/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/hypervideo/hypervideo/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
  867. def bug_reports_message(before=';'):
  868. msg = ('please report this issue on https://issues.hyperbola.info/ , '
  869. 'filling out the appropriate issue template. '
  870. 'Confirm you are on the latest version using pacman -Su')
  871. before = before.rstrip()
  872. if not before or before.endswith(('.', '!', '?')):
  873. msg = msg[0].title() + msg[1:]
  874. return (before + ' ' if before else '') + msg
  875. class YoutubeDLError(Exception):
  876. """Base exception for YoutubeDL errors."""
  877. msg = None
  878. def __init__(self, msg=None):
  879. if msg is not None:
  880. self.msg = msg
  881. elif self.msg is None:
  882. self.msg = type(self).__name__
  883. super().__init__(self.msg)
  884. network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
  885. if hasattr(ssl, 'CertificateError'):
  886. network_exceptions.append(ssl.CertificateError)
  887. network_exceptions = tuple(network_exceptions)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in hypervideo.
        cause, video_id and ie, when given, are included in the formatted message.
        """
        # An error raised while a network exception is being handled is always expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the innermost exception info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: '[ie] video_id: orig_msg (caused by ...)', followed by
        # the bug report hint unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback of tb and/or cause, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message exists, rebuild it (and args) whenever any other
        # attribute changes, so that str(error) reflects the new values
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
  924. class UnsupportedError(ExtractorError):
  925. def __init__(self, url):
  926. super().__init__(
  927. 'Unsupported URL: %s' % url, expected=True)
  928. self.url = url
  929. class RegexNotFoundError(ExtractorError):
  930. """Error when a regex didn't match"""
  931. pass
  932. class GeoRestrictedError(ExtractorError):
  933. """Geographic restriction Error exception.
  934. This exception may be thrown when a video is not available from your
  935. geographic location due to geographic restrictions imposed by a website.
  936. """
  937. def __init__(self, msg, countries=None, **kwargs):
  938. kwargs['expected'] = True
  939. super().__init__(msg, **kwargs)
  940. self.countries = countries
  941. class UserNotLive(ExtractorError):
  942. """Error when a channel/user is not live"""
  943. def __init__(self, msg=None, **kwargs):
  944. kwargs['expected'] = True
  945. super().__init__(msg or 'The channel is not currently live', **kwargs)
  946. class DownloadError(YoutubeDLError):
  947. """Download Error exception.
  948. This exception may be thrown by FileDownloader objects if they are not
  949. configured to continue on errors. They will contain the appropriate
  950. error message.
  951. """
  952. def __init__(self, msg, exc_info=None):
  953. """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  954. super().__init__(msg)
  955. self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Class-level message for this error
    msg = 'Entry not found in info'
  962. class SameFileError(YoutubeDLError):
  963. """Same File exception.
  964. This exception will be thrown by FileDownloader objects if they detect
  965. multiple files would have to be downloaded to the same file on disk.
  966. """
  967. msg = 'Fixed output name but more than one file to download'
  968. def __init__(self, filename=None):
  969. if filename is not None:
  970. self.msg += f': {filename}'
  971. super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Class-level message; the subclasses below override it
    msg = 'The download was cancelled'
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Class-level message overriding DownloadCancelled.msg
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Class-level message overriding DownloadCancelled.msg
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Class-level message overriding DownloadCancelled.msg
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
  989. class ReExtractInfo(YoutubeDLError):
  990. """ Video info needs to be re-extracted. """
  991. def __init__(self, msg, expected=False):
  992. super().__init__(msg)
  993. self.expected = expected
  994. class ThrottledDownload(ReExtractInfo):
  995. """ Download speed below --throttled-rate. """
  996. msg = 'The download speed is below throttle limit'
  997. def __init__(self):
  998. super().__init__(self.msg, expected=False)
  999. class UnavailableVideoError(YoutubeDLError):
  1000. """Unavailable Format exception.
  1001. This exception will be thrown when a video is requested
  1002. in a format that is not available for that video.
  1003. """
  1004. msg = 'Unable to download video'
  1005. def __init__(self, err=None):
  1006. if err is not None:
  1007. self.msg += f': {err}'
  1008. super().__init__(self.msg)
  1009. class ContentTooShortError(YoutubeDLError):
  1010. """Content Too Short exception.
  1011. This exception may be raised by FileDownloader objects when a file they
  1012. download is too small for what the server announced first, indicating
  1013. the connection was probably interrupted.
  1014. """
  1015. def __init__(self, downloaded, expected):
  1016. super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
  1017. # Both in bytes
  1018. self.downloaded = downloaded
  1019. self.expected = expected
  1020. class XAttrMetadataError(YoutubeDLError):
  1021. def __init__(self, code=None, msg='Unknown error'):
  1022. super().__init__(msg)
  1023. self.code = code
  1024. self.msg = msg
  1025. # Parsing code and msg
  1026. if (self.code in (errno.ENOSPC, errno.EDQUOT)
  1027. or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
  1028. self.reason = 'NO_SPACE'
  1029. elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
  1030. self.reason = 'VALUE_TOO_LONG'
  1031. else:
  1032. self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Marker exception; adds nothing on top of YoutubeDLError."""
    pass
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if configured, bind it to a source address.

    @param ydl_handler  Handler whose `_params` dict may hold 'source_address'
    @param http_class   Connection class, called with `*args, **kwargs`
    @param is_https     Not used inside this function
    @return             The created connection object
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable; it is
            # indexed with [0] and passed to bind(), so it is used as an address tuple
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source address selects IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and close the half-open socket before retrying
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        # Only override the connection factory when the connection object has one
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
  1078. def handle_youtubedl_headers(headers):
  1079. filtered_headers = headers
  1080. if 'Youtubedl-no-compression' in filtered_headers:
  1081. filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
  1082. del filtered_headers['Youtubedl-no-compression']
  1083. return filtered_headers
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store `params`; the remaining arguments go to urllib's HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, using a SOCKS connection class if the request asks for one."""
        conn_class = http.client.HTTPConnection

        # The internal "Ytdl-socks-proxy" header selects a SOCKS proxy and is
        # removed from the request once consumed
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and zlib-wrapped second."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; falsy input is returned unchanged."""
        if not data:
            return data
        # `brotli` here is the module-level name, not this method
        return brotli.decompress(data)

    def http_request(self, req):
        """Escape the request URL and fill in default headers before sending."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode gzip/deflate/brotli bodies and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; re-raise the first
                # error if none of the attempts decompresses cleanly
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
  1194. def make_socks_conn_class(base_class, socks_proxy):
  1195. assert issubclass(base_class, (
  1196. http.client.HTTPConnection, http.client.HTTPSConnection))
  1197. url_components = urllib.parse.urlparse(socks_proxy)
  1198. if url_components.scheme.lower() == 'socks5':
  1199. socks_type = ProxyType.SOCKS5
  1200. elif url_components.scheme.lower() in ('socks', 'socks4'):
  1201. socks_type = ProxyType.SOCKS4
  1202. elif url_components.scheme.lower() == 'socks4a':
  1203. socks_type = ProxyType.SOCKS4A
  1204. def unquote_if_non_empty(s):
  1205. if not s:
  1206. return s
  1207. return urllib.parse.unquote_plus(s)
  1208. proxy_args = (
  1209. socks_type,
  1210. url_components.hostname, url_components.port or 1080,
  1211. True, # Remote DNS
  1212. unquote_if_non_empty(url_components.username),
  1213. unquote_if_non_empty(url_components.password),
  1214. )
  1215. class SocksConnection(base_class):
  1216. def connect(self):
  1217. self.sock = sockssocket()
  1218. self.sock.setproxy(*proxy_args)
  1219. if isinstance(self.timeout, (int, float)):
  1220. self.sock.settimeout(self.timeout)
  1221. self.sock.connect((self.host, self.port))
  1222. if isinstance(self, http.client.HTTPSConnection):
  1223. if hasattr(self, '_context'): # Python > 2.6
  1224. self.sock = self._context.wrap_socket(
  1225. self.sock, server_hostname=self.host)
  1226. else:
  1227. self.sock = ssl.wrap_socket(self.sock)
  1228. return SocksConnection
  1229. class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
  1230. def __init__(self, params, https_conn_class=None, *args, **kwargs):
  1231. urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
  1232. self._https_conn_class = https_conn_class or http.client.HTTPSConnection
  1233. self._params = params
  1234. def https_open(self, req):
  1235. kwargs = {}
  1236. conn_class = self._https_conn_class
  1237. if hasattr(self, '_context'): # python > 2.6
  1238. kwargs['context'] = self._context
  1239. if hasattr(self, '_check_hostname'): # python 3.x
  1240. kwargs['check_hostname'] = self._check_hostname
  1241. socks_proxy = req.headers.get('Ytdl-socks-proxy')
  1242. if socks_proxy:
  1243. conn_class = make_socks_conn_class(conn_class, socks_proxy)
  1244. del req.headers['Ytdl-socks-proxy']
  1245. try:
  1246. return self.do_open(
  1247. functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
  1248. except urllib.error.URLError as e:
  1249. if (isinstance(e.reason, ssl.SSLError)
  1250. and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
  1251. raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
  1252. raise
  1253. def is_path_like(f):
  1254. return isinstance(f, (str, bytes, os.PathLike))
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    Cookie jar that reads and writes Netscape formatted cookie files.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # Line prefix that is stripped from entries before they are parsed
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a cookie file entry
    _ENTRY_LEN = 7
    # Written at the top of every saved cookie file
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by hypervideo. Do not edit.
'''
    # Field names of one cookie file entry, in file order
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        """`filename` may be a path-like object or any other object (kept as given)."""
        super().__init__(None, *args, **kwargs)
        # Path-like values are normalized with os.fspath; anything else is stored as is
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        """Return 'TRUE' or 'FALSE' depending on the truthiness of `cndn`."""
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Context manager yielding a file object for `file`.

        Path-like values are opened (UTF-8, for writing if `write`) and closed
        on exit. Any other object is yielded as is, truncated first if `write`.
        """
        if is_path_like(file):
            # `open` below is the builtin; this method is not visible by bare name here
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write one tab-separated line per cookie to `f`.

        Cookies marked discard are skipped unless `ignore_discard`; expired
        cookies are skipped unless `ignore_expires`.
        """
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.

        Falls back to `self.filename` when `filename` is None and raises
        ValueError if neither is set. Extra arguments go to `_really_save`.

        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        # NOTE: this modifies the cookies held in the jar, not just the output
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Falls back to `self.filename` when `filename` is None and raises
        ValueError if neither is set. Malformed entries are skipped with a
        warning, except that a line starting with '[', '{' or '"' raises
        LoadError (the file is assumed to be JSON).
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Validate one line; returns it with any #HttpOnly_ prefix removed
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Collect the validated lines in memory and hand them to the base class parser
        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # The appended space keeps the [0] lookup safe for blank lines
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/hypervideo/hypervideo/wiki/FAQ#how-do-i-pass-cookies-to-hypervideo')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
  1370. class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
  1371. def __init__(self, cookiejar=None):
  1372. urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
  1373. def http_response(self, request, response):
  1374. return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
  1375. https_request = urllib.request.HTTPCookieProcessor.http_request
  1376. https_response = http_response
  1377. class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
  1378. """YoutubeDL redirect handler
  1379. The code is based on HTTPRedirectHandler implementation from CPython [1].
  1380. This redirect handler solves two issues:
  1381. - ensures redirect URL is always unicode under python 2
  1382. - introduces support for experimental HTTP response status code
  1383. 308 Permanent Redirect [2] used by some sites [3]
  1384. 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
  1385. 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
  1386. 3. https://github.com/ytdl-org/youtube-dl/issues/28768
  1387. """
  1388. http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
  1389. def redirect_request(self, req, fp, code, msg, headers, newurl):
  1390. """Return a Request or None in response to a redirect.
  1391. This is called by the http_error_30x methods when a
  1392. redirection response is received. If a redirection should
  1393. take place, return a new Request to allow http_error_30x to
  1394. perform the redirect. Otherwise, raise HTTPError if no-one
  1395. else should try to handle this url. Return None if you can't
  1396. but another Handler might.
  1397. """
  1398. m = req.get_method()
  1399. if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
  1400. or code in (301, 302, 303) and m == "POST")):
  1401. raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
  1402. # Strictly (according to RFC 2616), 301 or 302 in response to
  1403. # a POST MUST NOT cause a redirection without confirmation
  1404. # from the user (of urllib.request, in this case). In practice,
  1405. # essentially all clients do redirect in this case, so we do
  1406. # the same.
  1407. # Be conciliant with URIs containing a space. This is mainly
  1408. # redundant with the more complete encoding done in http_error_302(),
  1409. # but it is kept for compatibility with other callers.
  1410. newurl = newurl.replace(' ', '%20')
  1411. CONTENT_HEADERS = ("content-length", "content-type")
  1412. # NB: don't use dict comprehension for python 2.6 compatibility
  1413. newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
  1414. # A 303 must either use GET or HEAD for subsequent request
  1415. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
  1416. if code == 303 and m != 'HEAD':
  1417. m = 'GET'
  1418. # 301 and 302 redirects are commonly turned into a GET from a POST
  1419. # for subsequent requests by browsers, so we'll do the same.
  1420. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
  1421. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
  1422. if code in (301, 302) and m == 'POST':
  1423. m = 'GET'
  1424. return urllib.request.Request(
  1425. newurl, headers=newheaders, origin_req_host=req.origin_req_host,
  1426. unverifiable=True, method=m)
  1427. def extract_timezone(date_str):
  1428. m = re.search(
  1429. r'''(?x)
  1430. ^.{8,}? # >=8 char non-TZ prefix, if present
  1431. (?P<tz>Z| # just the UTC Z, or
  1432. (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
  1433. (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
  1434. [ ]? # optional space
  1435. (?P<sign>\+|-) # +/-
  1436. (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
  1437. $)
  1438. ''', date_str)
  1439. if not m:
  1440. m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
  1441. timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
  1442. if timezone is not None:
  1443. date_str = date_str[:-len(m.group('tz'))]
  1444. timezone = datetime.timedelta(hours=timezone or 0)
  1445. else:
  1446. date_str = date_str[:-len(m.group('tz'))]
  1447. if not m.group('sign'):
  1448. timezone = datetime.timedelta()
  1449. else:
  1450. sign = 1 if m.group('sign') == '+' else -1
  1451. timezone = datetime.timedelta(
  1452. hours=sign * int(m.group('hours')),
  1453. minutes=sign * int(m.group('minutes')))
  1454. return timezone, date_str
  1455. def parse_iso8601(date_str, delimiter='T', timezone=None):
  1456. """ Return a UNIX timestamp from the given date """
  1457. if date_str is None:
  1458. return None
  1459. date_str = re.sub(r'\.[0-9]+', '', date_str)
  1460. if timezone is None:
  1461. timezone, date_str = extract_timezone(date_str)
  1462. with contextlib.suppress(ValueError):
  1463. date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
  1464. dt = datetime.datetime.strptime(date_str, date_format) - timezone
  1465. return calendar.timegm(dt.timetuple())
  1466. def date_formats(day_first=True):
  1467. return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
  1468. def unified_strdate(date_str, day_first=True):
  1469. """Return a string with the date in the format YYYYMMDD"""
  1470. if date_str is None:
  1471. return None
  1472. upload_date = None
  1473. # Replace commas
  1474. date_str = date_str.replace(',', ' ')
  1475. # Remove AM/PM + timezone
  1476. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  1477. _, date_str = extract_timezone(date_str)
  1478. for expression in date_formats(day_first):
  1479. with contextlib.suppress(ValueError):
  1480. upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
  1481. if upload_date is None:
  1482. timetuple = email.utils.parsedate_tz(date_str)
  1483. if timetuple:
  1484. with contextlib.suppress(ValueError):
  1485. upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
  1486. if upload_date is not None:
  1487. return str(upload_date)
  1488. def unified_timestamp(date_str, day_first=True):
  1489. if date_str is None:
  1490. return None
  1491. date_str = re.sub(r'\s+', ' ', re.sub(
  1492. r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
  1493. pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
  1494. timezone, date_str = extract_timezone(date_str)
  1495. # Remove AM/PM + timezone
  1496. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  1497. # Remove unrecognized timezones from ISO 8601 alike timestamps
  1498. m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
  1499. if m:
  1500. date_str = date_str[:-len(m.group('tz'))]
  1501. # Python only supports microseconds, so remove nanoseconds
  1502. m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
  1503. if m:
  1504. date_str = m.group(1)
  1505. for expression in date_formats(day_first):
  1506. with contextlib.suppress(ValueError):
  1507. dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
  1508. return calendar.timegm(dt.timetuple())
  1509. timetuple = email.utils.parsedate_tz(date_str)
  1510. if timetuple:
  1511. return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
  1512. def determine_ext(url, default_ext='unknown_video'):
  1513. if url is None or '.' not in url:
  1514. return default_ext
  1515. guess = url.partition('?')[0].rpartition('.')[2]
  1516. if re.match(r'^[A-Za-z0-9]+$', guess):
  1517. return guess
  1518. # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
  1519. elif guess.rstrip('/') in KNOWN_EXTENSIONS:
  1520. return guess.rstrip('/')
  1521. else:
  1522. return default_ext
  1523. def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
  1524. return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
  1525. def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
  1526. R"""
  1527. Return a datetime object from a string.
  1528. Supported format:
  1529. (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
  1530. @param format strftime format of DATE
  1531. @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
  1532. auto: round to the unit provided in date_str (if applicable).
  1533. """
  1534. auto_precision = False
  1535. if precision == 'auto':
  1536. auto_precision = True
  1537. precision = 'microsecond'
  1538. today = datetime_round(datetime.datetime.utcnow(), precision)
  1539. if date_str in ('now', 'today'):
  1540. return today
  1541. if date_str == 'yesterday':
  1542. return today - datetime.timedelta(days=1)
  1543. match = re.match(
  1544. r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
  1545. date_str)
  1546. if match is not None:
  1547. start_time = datetime_from_str(match.group('start'), precision, format)
  1548. time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
  1549. unit = match.group('unit')
  1550. if unit == 'month' or unit == 'year':
  1551. new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
  1552. unit = 'day'
  1553. else:
  1554. if unit == 'week':
  1555. unit = 'day'
  1556. time *= 7
  1557. delta = datetime.timedelta(**{unit + 's': time})
  1558. new_date = start_time + delta
  1559. if auto_precision:
  1560. return datetime_round(new_date, unit)
  1561. return new_date
  1562. return datetime_round(datetime.datetime.strptime(date_str, format), precision)
  1563. def date_from_str(date_str, format='%Y%m%d', strict=False):
  1564. R"""
  1565. Return a date object from a string using datetime_from_str
  1566. @param strict Restrict allowed patterns to "YYYYMMDD" and
  1567. (now|today|yesterday)(-\d+(day|week|month|year)s?)?
  1568. """
  1569. if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
  1570. raise ValueError(f'Invalid date format "{date_str}"')
  1571. return datetime_from_str(date_str, precision='microsecond', format=format).date()
  1572. def datetime_add_months(dt, months):
  1573. """Increment/Decrement a datetime object by months."""
  1574. month = dt.month + months - 1
  1575. year = dt.year + month // 12
  1576. month = month % 12 + 1
  1577. day = min(dt.day, calendar.monthrange(year, month)[1])
  1578. return dt.replace(year, month, day)
  1579. def datetime_round(dt, precision='day'):
  1580. """
  1581. Round a datetime object's time to a specific precision
  1582. """
  1583. if precision == 'microsecond':
  1584. return dt
  1585. unit_seconds = {
  1586. 'day': 86400,
  1587. 'hour': 3600,
  1588. 'minute': 60,
  1589. 'second': 1,
  1590. }
  1591. roundto = lambda x, n: ((x + n / 2) // n) * n
  1592. timestamp = calendar.timegm(dt.timetuple())
  1593. return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
  1594. def hyphenate_date(date_str):
  1595. """
  1596. Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
  1597. match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
  1598. if match is not None:
  1599. return '-'.join(match.groups())
  1600. else:
  1601. return date_str
  1602. class DateRange:
  1603. """Represents a time interval between two dates"""
  1604. def __init__(self, start=None, end=None):
  1605. """start and end must be strings in the format accepted by date"""
  1606. if start is not None:
  1607. self.start = date_from_str(start, strict=True)
  1608. else:
  1609. self.start = datetime.datetime.min.date()
  1610. if end is not None:
  1611. self.end = date_from_str(end, strict=True)
  1612. else:
  1613. self.end = datetime.datetime.max.date()
  1614. if self.start > self.end:
  1615. raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
  1616. @classmethod
  1617. def day(cls, day):
  1618. """Returns a range that only contains the given day"""
  1619. return cls(day, day)
  1620. def __contains__(self, date):
  1621. """Check if the date is in the range"""
  1622. if not isinstance(date, datetime.date):
  1623. date = date_from_str(date)
  1624. return self.start <= date <= self.end
  1625. def __str__(self):
  1626. return f'{self.start.isoformat()} - {self.end.isoformat()}'
  1627. def __eq__(self, other):
  1628. return (isinstance(other, DateRange)
  1629. and self.start == other.start and self.end == other.end)
  1630. def platform_name():
  1631. """ Returns the platform name as a str """
  1632. deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
  1633. return platform.platform()
  @functools.cache
  def system_identifier():
      """Describe the interpreter, machine, platform, OpenSSL and libc in a single line.

      The result is computed once and then served from the functools cache.
      """
      implementation = platform.python_implementation()
      if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
          implementation = '%s version %d.%d.%d' % (implementation, *sys.pypy_version_info[:3])

      try:
          libc_ver = platform.libc_ver()
      except OSError:  # We may not have access to the executable
          libc_ver = []

      return 'Python %s (%s %s %s) - %s (%s%s)' % (
          platform.python_version(),
          implementation,
          platform.machine(),
          platform.architecture()[0],
          platform.platform(),
          ssl.OPENSSL_VERSION,
          # The libc part is only appended (after ", ") when there is something to show
          format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
      )
  @functools.cache
  def get_windows_version():
      """Get the Windows version as a version tuple; returns () if not running on Windows."""
      if compat_os_name != 'nt':
          return ()
      return version_tuple(platform.win32_ver()[1])
  def write_string(s, out=None, encoding=None):
      """Write the str `s` to `out` and flush it.

      @param s         Text to write; must be a str
      @param out       Target stream; sys.stderr is used when it is falsy
      @param encoding  Encoding to use when bytes have to be written. Falls back
                       to the encoding of `out` (for text streams exposing a
                       `buffer`) and then to preferredencoding()

      Streams opened in binary mode and text streams exposing a `buffer` receive
      encoded bytes (unencodable characters are dropped); any other stream
      receives the str as-is. Nothing is written when no stream is available.
      """
      assert isinstance(s, str)
      out = out or sys.stderr
      # sys.stderr itself can be None (e.g. pythonw or windowed frozen apps);
      # there is nowhere to write to in that case
      if out is None:
          return

      if compat_os_name == 'nt' and supports_terminal_sequences(out):
          s = re.sub(r'([\r\n]+)', r' \1', s)

      enc, buffer = None, out
      if 'b' in getattr(out, 'mode', ''):
          enc = encoding or preferredencoding()
      elif hasattr(out, 'buffer'):
          # Write through the underlying binary buffer so that we control the encoding
          buffer = out.buffer
          enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

      buffer.write(s.encode(enc, 'ignore') if enc else s)
      out.flush()
  def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
      """Report that something is deprecated.

      When running as CLI, every distinct message is reported only once: through
      `printer` if given, otherwise through write_string (both receive **kwargs).
      Otherwise a DeprecationWarning is issued with the stacklevel raised by 3.
      """
      from . import _IN_CLI
      if not _IN_CLI:
          import warnings
          warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
          return

      if msg in deprecation_warning._cache:
          return
      deprecation_warning._cache.add(msg)
      if printer:
          return printer(f'{msg}{bug_reports_message()}', **kwargs)
      return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


  # Messages that were already reported in CLI mode
  deprecation_warning._cache = set()
  def bytes_to_intlist(bs):
      """Return the items of `bs` as a list of ints.

      Items that already are ints (bytes objects) are copied; otherwise each item
      is converted with ord(). Empty input gives an empty list.
      """
      if not bs:
          return []
      already_ints = isinstance(bs[0], int)  # Python 3
      return list(bs) if already_ints else [ord(c) for c in bs]
  def intlist_to_bytes(xs):
      """Pack a sequence of ints into bytes, one unsigned byte per item (b'' for empty input)."""
      return struct.pack('%dB' % len(xs), *xs) if xs else b''
  class LockingUnsupportedError(OSError):
      """OSError raised with a fixed message when file locking is not supported."""
      msg = 'File locking is not supported'

      def __init__(self):
          OSError.__init__(self, self.msg)
  # Cross-platform file locking
  #
  # Defines two helpers whose implementation depends on the platform:
  #   _lock_file(f, exclusive, block)  - lock the open file `f`; the lock is
  #       exclusive or shared, and the call either blocks or fails immediately
  #   _unlock_file(f)                  - release the lock taken by _lock_file
  # Windows uses LockFileEx/UnlockFileEx through ctypes, other platforms use fcntl.
  # When fcntl cannot be imported both helpers raise LockingUnsupportedError.
  if sys.platform == 'win32':
      import ctypes
      import ctypes.wintypes
      import msvcrt

      class OVERLAPPED(ctypes.Structure):
          """ctypes layout of the structure passed to LockFileEx/UnlockFileEx."""
          _fields_ = [
              ('Internal', ctypes.wintypes.LPVOID),
              ('InternalHigh', ctypes.wintypes.LPVOID),
              ('Offset', ctypes.wintypes.DWORD),
              ('OffsetHigh', ctypes.wintypes.DWORD),
              ('hEvent', ctypes.wintypes.HANDLE),
          ]

      kernel32 = ctypes.windll.kernel32
      LockFileEx = kernel32.LockFileEx
      LockFileEx.argtypes = [
          ctypes.wintypes.HANDLE,  # hFile
          ctypes.wintypes.DWORD,  # dwFlags
          ctypes.wintypes.DWORD,  # dwReserved
          ctypes.wintypes.DWORD,  # nNumberOfBytesToLockLow
          ctypes.wintypes.DWORD,  # nNumberOfBytesToLockHigh
          ctypes.POINTER(OVERLAPPED)  # Overlapped
      ]
      LockFileEx.restype = ctypes.wintypes.BOOL
      UnlockFileEx = kernel32.UnlockFileEx
      UnlockFileEx.argtypes = [
          ctypes.wintypes.HANDLE,  # hFile
          ctypes.wintypes.DWORD,  # dwReserved
          ctypes.wintypes.DWORD,  # nNumberOfBytesToLockLow
          ctypes.wintypes.DWORD,  # nNumberOfBytesToLockHigh
          ctypes.POINTER(OVERLAPPED)  # Overlapped
      ]
      UnlockFileEx.restype = ctypes.wintypes.BOOL
      # Size of the locked region (low/high DWORD), used for both locking and unlocking
      whole_low = 0xffffffff
      whole_high = 0x7fffffff

      def _lock_file(f, exclusive, block):
          overlapped = OVERLAPPED()
          overlapped.Offset = 0
          overlapped.OffsetHigh = 0
          overlapped.hEvent = 0
          # Kept on the file object since _unlock_file must pass it again
          f._lock_file_overlapped_p = ctypes.pointer(overlapped)

          if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                            # 0x2: exclusive lock, 0x1: fail immediately instead of blocking
                            (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                            0, whole_low, whole_high, f._lock_file_overlapped_p):
              # NB: No argument form of "ctypes.FormatError" does not work on PyPy
              raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

      def _unlock_file(f):
          assert f._lock_file_overlapped_p
          handle = msvcrt.get_osfhandle(f.fileno())
          if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
              # NB: No argument form of "ctypes.FormatError" does not work on PyPy
              raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

  else:
      try:
          import fcntl

          def _lock_file(f, exclusive, block):
              flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
              if not block:
                  flags |= fcntl.LOCK_NB

              try:
                  fcntl.flock(f, flags)
              except BlockingIOError:
                  # The lock is held by someone else; do not retry with lockf
                  raise
              except OSError:  # AOSP does not have flock()
                  fcntl.lockf(f, flags)

          def _unlock_file(f):
              try:
                  fcntl.flock(f, fcntl.LOCK_UN)
              except OSError:
                  fcntl.lockf(f, fcntl.LOCK_UN)

      except ImportError:

          def _lock_file(f, exclusive, block):
              raise LockingUnsupportedError()

          def _unlock_file(f):
              raise LockingUnsupportedError()
  class locked_file:
      """File wrapper that holds a lock on the file while it is in use.

      Use it as a context manager, or call open()/close() explicitly. Entering
      takes the lock (shared for the read modes, exclusive otherwise) and, for
      the 'w' modes, truncates the file only after the lock is held. Attributes
      not defined here are looked up on the wrapped file object.
      """
      locked = False

      def __init__(self, filename, mode, block=True, encoding=None):
          """Open `filename` without locking it yet.

          @param mode      One of 'r', 'rb', 'a', 'ab', 'w', 'wb'; anything else
                           raises NotImplementedError
          @param block     Whether taking the lock may block
          @param encoding  Passed on to os.fdopen
          """
          if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
              raise NotImplementedError(mode)
          self.mode, self.block = mode, block

          writable = any(f in mode for f in 'wax+')
          readable = any(f in mode for f in 'r+')
          if not writable:
              access = os.O_RDONLY
          elif readable:
              access = os.O_RDWR
          else:
              access = os.O_WRONLY
          flags = (
              getattr(os, 'O_CLOEXEC', 0)  # UNIX only
              | getattr(os, 'O_BINARY', 0)  # Windows only
              | getattr(os, 'O_NOINHERIT', 0)  # Windows only
              | (os.O_CREAT if writable else 0)  # O_TRUNC only after locking
              | (os.O_APPEND if 'a' in mode else 0)
              | (os.O_EXCL if 'x' in mode else 0)
              | access
          )

          self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

      def __enter__(self):
          shared = 'r' in self.mode
          try:
              _lock_file(self.f, not shared, self.block)
          except OSError:
              # Without the lock the file must not stay open
              self.f.close()
              raise
          else:
              self.locked = True

          if 'w' in self.mode:
              try:
                  self.f.truncate()
              except OSError as e:
                  tolerated = (
                      errno.ESPIPE,  # Illegal seek - expected for FIFO
                      errno.EINVAL,  # Invalid argument - expected for /dev/null
                  )
                  if e.errno not in tolerated:
                      raise
          return self

      def unlock(self):
          """Release the lock if it is held; the file itself stays open."""
          if not self.locked:
              return
          try:
              _unlock_file(self.f)
          finally:
              self.locked = False

      def __exit__(self, *_):
          try:
              self.unlock()
          finally:
              self.f.close()

      open = __enter__
      close = __exit__

      def __getattr__(self, attr):
          return getattr(self.f, attr)

      def __iter__(self):
          return iter(self.f)
  @functools.cache
  def get_filesystem_encoding():
      """Return sys.getfilesystemencoding(), or 'utf-8' when that is None (result is cached)."""
      reported = sys.getfilesystemencoding()
      if reported is None:
          return 'utf-8'
      return reported
  def shell_quote(args):
      """Quote every argument with compat_shlex_quote and join them with spaces.

      bytes arguments are first decoded using the filesystem encoding.
      """
      encoding = get_filesystem_encoding()
      return ' '.join(
          # We may get a filename encoded with 'encodeFilename'
          compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
          for arg in args)
  def smuggle_url(url, data):
      """Pass additional data in a URL for internal use.

      `data` is JSON-encoded into the URL fragment. Data that was already
      smuggled into `url` is merged into `data` (in place) and takes precedence.
      """
      url, previous = unsmuggle_url(url, {})
      data.update(previous)
      payload = json.dumps(data)
      fragment = urllib.parse.urlencode({'__youtubedl_smuggle': payload})
      return url + '#' + fragment
  def unsmuggle_url(smug_url, default=None):
      """Split a smuggled URL into (url, data).

      When nothing was smuggled, the URL is returned as-is together with `default`.
      """
      if '#__youtubedl_smuggle' in smug_url:
          url, _, fragment = smug_url.rpartition('#')
          payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
          return url, json.loads(payload)
      return smug_url, default
  def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
      """Format a number with a decimal suffix like k, M, etc.

      @param num     The number to format; converted with float_or_none
      @param fmt     %-format string that receives (scaled number, suffix)
      @param factor  Base of the suffixes. With 1024 the binary suffixes
                     (Ki, Mi, ...) are used

      Returns None when `num` cannot be converted or is negative.
      Numbers smaller than 1 are formatted without any suffix.
      """
      num, factor = float_or_none(num), float(factor)
      if num is None or num < 0:
          return None
      POSSIBLE_SUFFIXES = 'kMGTPEZY'
      if num == 0:
          exponent = 0
      else:
          # The logarithm is negative for numbers below 1. Without the lower bound
          # that would index the suffixes from the end (or past them)
          exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
      suffix = ['', *POSSIBLE_SUFFIXES][exponent]
      if factor == 1024:
          suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
      converted = num / (factor ** exponent)
      return fmt % (converted, suffix)
  def format_bytes(bytes):
      """Format a byte count with two decimals and a 1024-based suffix, or 'N/A' if that fails."""
      formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
      return formatted or 'N/A'
  def lookup_unit_table(unit_table, s, strict=False):
      """Parse "<number> <unit>" from `s` and scale the number by the unit's multiplier.

      @param unit_table  Mapping of unit name to multiplier
      @param strict      If set, the whole string must match and only '.' is
                         accepted as decimal separator. Otherwise a match at the
                         start of the string suffices and ',' is accepted too

      Returns the rounded product, or None when `s` does not match.
      """
      if strict:
          matcher, num_re = re.fullmatch, NUMBER_RE
      else:
          matcher, num_re = re.match, NUMBER_RE.replace(R'\.', '[,.]')
      units_re = '|'.join(map(re.escape, unit_table))
      m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
      if not m:
          return None

      value = float(m.group('num').replace(',', '.'))
      return round(value * unit_table[m.group('unit')])
  def parse_bytes(s):
      """Parse a string indicating a byte quantity into an integer.

      The string is upper-cased and matched strictly against a number followed
      by an optional single-letter suffix (K, M, G, ...) in powers of 1024.
      """
      multipliers = {
          suffix: 1024 ** power
          for power, suffix in enumerate(['', *'KMGTPEZY'])}
      return lookup_unit_table(multipliers, s.upper(), strict=True)
  def parse_filesize(s):
      """Parse a file size such as '5 MiB' or '1.2Gb' into a number of bytes.

      Returns None for None and for strings that lookup_unit_table cannot match.
      """
      if s is None:
          return None

      # (letter, decimal unit name, binary unit name) for growing powers
      prefixes = (
          ('K', 'kilo', 'kibi'),
          ('M', 'mega', 'mebi'),
          ('G', 'giga', 'gibi'),
          ('T', 'tera', 'tebi'),
          ('P', 'peta', 'pebi'),
          ('E', 'exa', 'exbi'),
          ('Z', 'zetta', 'zebi'),
          ('Y', 'yotta', 'yobi'),
      )

      # The lower-case forms are of course incorrect and unofficial,
      # but we support those too
      _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
      for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
          lower = letter.lower()
          decimal, binary = 1000 ** power, 1024 ** power
          _UNIT_TABLE.update({
              f'{letter}iB': binary,
              f'{letter}B': decimal,
              f'{lower}B': binary,
              f'{letter}b': decimal,
              f'{lower}b': decimal,
              f'{decimal_name}bytes': decimal,
              f'{binary_name}bytes': binary,
          })

      return lookup_unit_table(_UNIT_TABLE, s)
  def parse_count(s):
      """Parse a count such as '1,234', '1.5M' or '12k views' into an int.

      Returns None for None and when no number can be extracted.
      """
      if s is None:
          return None

      # Drop a leading non-numeric word (anything without digits up to a whitespace)
      s = re.sub(r'^[^\d]+\s', '', s).strip()

      if re.match(r'^[\d,.]+$', s):
          return str_to_int(s)

      _UNIT_TABLE = {
          'k': 1000, 'K': 1000,
          'm': 1000 ** 2, 'M': 1000 ** 2,
          'kk': 1000 ** 2, 'KK': 1000 ** 2,
          'b': 1000 ** 3, 'B': 1000 ** 3,
      }
      scaled = lookup_unit_table(_UNIT_TABLE, s)
      if scaled is not None:
          return scaled

      # Fall back to a plain number that is followed by something else
      leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
      if leading_number:
          return str_to_int(leading_number.group(1))
      return None
  def parse_resolution(s, *, lenient=False):
      """Extract a resolution from a string.

      Recognized, in this order: "<width>x<height>" (also with X, × or ","),
      "<height>p"/"<height>i", and "4k"/"8k". Returns a dict with 'width' and
      'height', with only 'height', or an empty dict when nothing is found.
      Unless `lenient` is set, "<width>x<height>" must not be directly preceded
      or followed by a letter or digit.
      """
      if s is None:
          return {}

      dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
      if not lenient:
          dimensions_re = rf'(?<![a-zA-Z0-9]){dimensions_re}(?![a-zA-Z0-9])'
      mobj = re.search(dimensions_re, s)
      if mobj:
          return {
              'width': int(mobj.group('w')),
              'height': int(mobj.group('h')),
          }

      mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
      if mobj:
          return {'height': int(mobj.group(1))}

      mobj = re.search(r'\b([48])[kK]\b', s)
      if mobj:
          # 4k -> 2160, 8k -> 4320
          return {'height': int(mobj.group(1)) * 540}

      return {}
  def parse_bitrate(s):
      """Return the number in front of "kbps" in `s` as an int; None if absent or `s` is not a str."""
      if not isinstance(s, str):
          return
      mobj = re.search(r'\b(\d+)\s*kbps', s)
      return int(mobj.group(1)) if mobj else None
  def month_by_name(name, lang='en'):
      """Return the number (1-12) of a month given its full name, or None if unknown.

      The names come from MONTH_NAMES[lang]; English is used for unknown languages.
      """
      month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

      try:
          position = month_names.index(name)
      except ValueError:
          return None
      return position + 1
  def month_by_abbreviation(abbrev):
      """Return the number (1-12) of a month given the first three letters of its
      English name, or None if there is no such month"""
      abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]

      try:
          position = abbreviations.index(abbrev)
      except ValueError:
          return None
      return position + 1
  def fix_xml_ampersands(xml_str):
      """Replace all the '&' by '&amp;' in XML, except where they start a reference.

      Left untouched are the predefined entities (amp, lt, gt, apos, quot) and
      decimal/hexadecimal character references with any number of digits (at
      least one), e.g. '&#47;' or '&#x1F600;'.
      """
      return re.sub(
          r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
          '&amp;',
          xml_str)
  def setproctitle(title):
      """Set the name of the current process through prctl of libc.so.6 (best effort).

      Silently does nothing when ctypes or the libc cannot be loaded, or when the
      libc has no prctl.

      @param title  The new name; must be a str
      """
      assert isinstance(title, str)

      # Workaround for https://github.com/hypervideo/hypervideo/issues/4541
      try:
          import ctypes
      except ImportError:
          return

      try:
          libc = ctypes.cdll.LoadLibrary('libc.so.6')
      except OSError:
          return
      except TypeError:
          # LoadLibrary in Windows Python 2.7.13 only expects
          # a bytestring, but since unicode_literals turns
          # every string into a unicode string, it fails.
          return
      # Initializing the buffer from the bytes makes it one item larger than the
      # title, so that the string is NUL-terminated as prctl expects
      buf = ctypes.create_string_buffer(title.encode())
      try:
          libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
      except AttributeError:
          return  # Strange libc, just skip this
  def remove_start(s, start):
      """Return `s` without the leading `start`; `s` is returned as-is if it is None or does not start with it."""
      if s is None or not s.startswith(start):
          return s
      return s[len(start):]
  def remove_end(s, end):
      """Return `s` without the trailing `end`.

      `s` is returned as-is if it is None, if it does not end with `end`, or if
      `end` is empty.
      """
      # An empty `end` needs the explicit check: s[:-0] would be the empty string
      if s is None or not end or not s.endswith(end):
          return s
      return s[:-len(end)]
  def remove_quotes(s):
      """Strip one pair of matching single or double quotes surrounding `s`, if any."""
      if s is None or len(s) < 2:
          return s
      if s[0] == s[-1] and s[0] in ('"', "'"):
          return s[1:-1]
      return s
  def get_domain(url):
      """
      Return the network location of the URL without a leading "www.", or None if empty.

      This implementation is inconsistent, but is kept for compatibility.
      Use this only for "webpage_url_domain"
      """
      netloc = urllib.parse.urlparse(url).netloc
      return remove_start(netloc, 'www.') or None
  def url_basename(url):
      """Return the last segment of the URL's path (ignoring slashes at either end of the path)."""
      trimmed_path = urllib.parse.urlparse(url).path.strip('/')
      return trimmed_path.rpartition('/')[2]
  def base_url(url):
      """Return the http(s) URL up to and including the last '/' before any query or fragment.

      Raises AttributeError when the URL does not have that form.
      """
      mobj = re.match(r'https?://[^?#]+/', url)
      return mobj.group()
  def urljoin(base, path):
      """Join `path` onto the URL `base`.

      @param base  str or bytes. Must be an http(s) or protocol-relative URL
                   unless `path` already is a full URL
      @param path  str or bytes

      Returns `path` itself if it already starts with "<scheme>://" or "//".
      Returns None if `path` is empty or not a string, or if `base` is unsuitable.
      """
      if isinstance(path, bytes):
          path = path.decode()
      if not isinstance(path, str) or not path:
          return None
      # NB: "-" must come last in the character class; "+-." would be a range
      # that also lets "," pass as part of a scheme
      if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
          return path
      if isinstance(base, bytes):
          base = base.decode()
      if not isinstance(base, str) or not re.match(
              r'^(?:https?:)?//', base):
          return None
      return urllib.parse.urljoin(base, path)
  class HEADRequest(urllib.request.Request):
      """A urllib request whose method is always HEAD."""

      def get_method(self):
          return 'HEAD'
  class PUTRequest(urllib.request.Request):
      """A urllib request whose method is always PUT."""

      def get_method(self):
          return 'PUT'
  def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
      """Convert `v` to int, returning `default` when that is not possible.

      @param get_attr  If set (and `v` is not None), convert this attribute of
                       `v` instead of `v` itself
      @param scale     The result is floor-divided by this
      @param invscale  The result is multiplied by this (before dividing)
      """
      value = v
      if get_attr and value is not None:
          value = getattr(value, get_attr, None)
      try:
          return int(value) * invscale // scale
      except (ValueError, TypeError, OverflowError):
          return default
  def str_or_none(v, default=None):
      """Return str(v), or `default` if `v` is None."""
      if v is None:
          return default
      return str(v)
  def str_to_int(int_str):
      """A more relaxed version of int_or_none.

      ints are returned as-is; in strings every ',', '.' and '+' is dropped
      before the conversion.
      """
      if isinstance(int_str, int):
          return int_str
      if isinstance(int_str, str):
          int_str = re.sub(r'[,\.\+]', '', int_str)
      return int_or_none(int_str)
  def float_or_none(v, scale=1, invscale=1, default=None):
      """Convert `v` to float, returning `default` when that is not possible.

      @param scale     The result is divided by this
      @param invscale  The result is multiplied by this (before dividing)
      """
      if v is None:
          return default
      try:
          return float(v) * invscale / scale
      # OverflowError: e.g. an int that is too large for a float (as in int_or_none)
      except (ValueError, TypeError, OverflowError):
          return default
  def bool_or_none(v, default=None):
      """Return `v` if it is a bool, otherwise `default`."""
      if isinstance(v, bool):
          return v
      return default
  def strip_or_none(v, default=None):
      """Return `v` stripped of surrounding whitespace if it is a str, otherwise `default`."""
      if isinstance(v, str):
          return v.strip()
      return default
  def url_or_none(url):
      """Return the stripped URL if it looks like one, otherwise None.

      Accepted are protocol-relative URLs ("//...") and URLs with one of the
      schemes http(s), rtmp and its variants, rtsp and its variants, mms, ftp(s).
      """
      if not url or not isinstance(url, str):
          return None
      url = url.strip()
      if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
          return url
      return None
  def request_to_url(req):
      """Return the full URL of a urllib request; anything else is returned unchanged."""
      return req.get_full_url() if isinstance(req, urllib.request.Request) else req
  def strftime_or_none(timestamp, date_format, default=None):
      """Format a timestamp with strftime, returning `default` when that fails.

      @param timestamp    A unix timestamp (int/float; formatted in UTC) or a
                          date as 'YYYYMMDD' str
      @param date_format  strftime format. "%s" is replaced by the timestamp in
                          whole seconds
      @param default      Returned if `timestamp` is of any other type, cannot
                          be parsed, or is out of the supported range
      """
      datetime_object = None
      try:
          if isinstance(timestamp, (int, float)):  # unix timestamp
              # Using naive datetime here can break timestamp() in Windows
              # Ref: https://github.com/hypervideo/hypervideo/issues/5185, https://github.com/python/cpython/issues/94414
              datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
          elif isinstance(timestamp, str):  # assume YYYYMMDD
              datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
          date_format = re.sub(  # Support %s on windows
              r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
          return datetime_object.strftime(date_format)
      # AttributeError: `timestamp` is of an unsupported type (datetime_object is None)
      # OverflowError, OSError: the timestamp is out of range for the platform
      except (ValueError, TypeError, AttributeError, OverflowError, OSError):
          return default
  def parse_duration(s):
      """Parse a duration into a number of seconds (float).

      Tried in this order:
      1. clock notation "[[[days:]hours:]mins:]secs[.fraction]"
      2. numbers followed by units, like "1h 2m 3s", "3 min" or "PT1H5S"
      3. a (possibly fractional) number of hours or minutes, like "2.5 hours"

      Returns None if `s` is not a str, is blank, or matches none of the above.
      """
      if not isinstance(s, str):
          return None

      s = s.strip()
      if not s:
          return None

      days = hours = mins = secs = ms = None
      clock = re.match(r'''(?x)
              (?P<before_secs>
                  (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
              (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
              (?P<ms>[.:][0-9]+)?Z?$
          ''', s)
      if clock:
          days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
      else:
          # Years, months and weeks are accepted here but do not count towards the result
          with_units = re.match(
              r'''(?ix)(?:P?
                  (?:
                      [0-9]+\s*y(?:ears?)?,?\s*
                  )?
                  (?:
                      [0-9]+\s*m(?:onths?)?,?\s*
                  )?
                  (?:
                      [0-9]+\s*w(?:eeks?)?,?\s*
                  )?
                  (?:
                      (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                  )?
                  T)?
                  (?:
                      (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                  )?
                  (?:
                      (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                  )?
                  (?:
                      (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                  )?Z?$''', s)
          if with_units:
              days, hours, mins, secs, ms = with_units.groups()
          else:
              fractional = re.match(
                  r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
              if not fractional:
                  return None
              hours, mins = fractional.groups()

      if ms:
          # The fraction may have been separated by ":" in clock notation
          ms = ms.replace(':', '.')
      weighted_parts = (
          (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
      return sum(float(part or 0) * mult for part, mult in weighted_parts)
  def prepend_extension(filename, ext, expected_real_ext=None):
      """Insert `ext` in front of the file's real extension ("a.mp4" -> "a.<ext>.mp4").

      If `expected_real_ext` is given and the real extension differs from it,
      ".<ext>" is appended to the whole filename instead.
      """
      name, real_ext = os.path.splitext(filename)
      if expected_real_ext and real_ext[1:] != expected_real_ext:
          return f'{filename}.{ext}'
      return f'{name}.{ext}{real_ext}'
  def replace_extension(filename, ext, expected_real_ext=None):
      """Replace the file's real extension by `ext` ("a.mp4" -> "a.<ext>").

      If `expected_real_ext` is given and the real extension differs from it,
      ".<ext>" is appended to the whole filename instead.
      """
      name, real_ext = os.path.splitext(filename)
      unexpected = expected_real_ext and real_ext[1:] != expected_real_ext
      stem = filename if unexpected else name
      return f'{stem}.{ext}'
  def check_executable(exe, args=[]):
      """Check whether the given binary can be run, and return its name if so.

      args can be a list of arguments for a short output (like -version).
      Returns False if running the binary raises OSError.
      """
      command = [exe] + args
      try:
          Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      except OSError:
          return False
      return exe
  def _get_exe_version_output(exe, args):
      """Run `exe` with `args` and return its output (stderr merged into stdout) as text.

      Returns False if running the executable raises OSError.
      """
      try:
          # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
          # SIGTTOU if hypervideo is run in the background.
          # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
          output, _, _ = Popen.run(
              [encodeArgument(exe)] + args, text=True,
              stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
      except OSError:
          return False
      return output
  def detect_exe_version(output, version_re=None, unrecognized='present'):
      """Extract the version from the output of an executable.

      @param version_re    Regex whose first group is the version; by default
                           whatever follows the word "version"
      @param unrecognized  Returned if the regex does not match
      """
      assert isinstance(output, str)
      pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
      found = re.search(pattern, output)
      return found.group(1) if found else unrecognized
  def get_exe_version(exe, args=['--version'],
                      version_re=None, unrecognized='present'):
      """Return the version of the specified executable,
      or False if the executable is not present (or printed nothing)"""
      out = _get_exe_version_output(exe, args)
      if not out:
          return False
      return detect_exe_version(out, version_re, unrecognized)
  def frange(start=0, stop=None, step=1):
      """Float range: like range(), but the arguments need not be integers.

      With a single argument the values run from 0 up to that argument.
      A step of 0 yields nothing.
      """
      if stop is None:
          start, stop = 0, start
      if not step:
          sign = 0
      else:
          sign = 1 if step > 0 else -1
      current = start
      while sign * current < sign * stop:
          yield current
          current += step
  class LazyList(collections.abc.Sequence):
      """Lazy immutable list from an iterable

      Items are taken from the iterable only when needed and are cached.
      Note that slices of a LazyList are lists and not LazyList"""

      class IndexError(IndexError):
          pass

      def __init__(self, iterable, *, reverse=False, _cache=None):
          self._iterable = iter(iterable)
          self._cache = [] if _cache is None else _cache
          self._reversed = reverse

      def __iter__(self):
          if not self._reversed:
              yield from self._cache
              for item in self._iterable:
                  self._cache.append(item)
                  yield item
              return
          # We need to consume the entire iterable to iterate in reverse
          yield from self.exhaust()

      def _exhaust(self):
          self._cache.extend(self._iterable)
          self._iterable = []  # Discard the emptied iterable to make it pickle-able
          return self._cache

      def exhaust(self):
          """Evaluate the entire iterable"""
          everything = self._exhaust()
          return everything[::-1 if self._reversed else 1]

      @staticmethod
      def _reverse_index(x):
          return None if x is None else ~x

      def _from_cache(self, idx):
          # Look up in the cache, reporting a miss as the IndexError of this class
          try:
              return self._cache[idx]
          except IndexError as e:
              raise self.IndexError(e) from e

      def __getitem__(self, idx):
          if isinstance(idx, slice):
              if self._reversed:
                  idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
              start, stop, step = idx.start, idx.stop, idx.step or 1
          elif isinstance(idx, int):
              if self._reversed:
                  idx = self._reverse_index(idx)
              start, stop, step = idx, idx, 0
          else:
              raise TypeError('indices must be integers or slices')

          counts_from_end = (start or 0) < 0 or (stop or 0) < 0
          open_ended = (start is None and step < 0) or (stop is None and step > 0)
          if counts_from_end or open_ended:
              # We need to consume the entire iterable to be able to slice from the end
              # Obviously, never use this with infinite iterables
              self._exhaust()
              return self._from_cache(idx)

          # Fetch only as many further items as the index requires
          n = max(start or 0, stop or 0) - len(self._cache) + 1
          if n > 0:
              self._cache.extend(itertools.islice(self._iterable, n))
          return self._from_cache(idx)

      def __bool__(self):
          probe = -1 if self._reversed else 0
          try:
              self[probe]
          except self.IndexError:
              return False
          return True

      def __len__(self):
          self._exhaust()
          return len(self._cache)

      def __reversed__(self):
          return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

      def __copy__(self):
          return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

      def __repr__(self):
          # repr and str should mimic a list. So we exhaust the iterable
          return repr(self.exhaust())

      def __str__(self):
          return repr(self.exhaust())
  class PagedList:
      """Base class of lists whose items are fetched page by page through `pagefunc`.

      Subclasses must implement _getslice(start, end).
      """

      class IndexError(IndexError):
          pass

      def __len__(self):
          # This is only useful for tests
          return len(self.getslice())

      def __init__(self, pagefunc, pagesize, use_cache=True):
          self._pagefunc = pagefunc
          self._pagesize = pagesize
          self._pagecount = float('inf')
          self._use_cache = use_cache
          self._cache = {}

      def getpage(self, pagenum):
          """Return the items of the given page as a list, fetching it unless cached.

          Pages beyond the known page count are empty.
          """
          cached = self._cache.get(pagenum)
          if cached is not None:
              page_results = cached
          elif pagenum > self._pagecount:
              page_results = []
          else:
              page_results = list(self._pagefunc(pagenum))
          if self._use_cache:
              self._cache[pagenum] = page_results
          return page_results

      def getslice(self, start=0, end=None):
          """Return the items from `start` up to (excluding) `end` as a list."""
          return list(self._getslice(start, end))

      def _getslice(self, start, end):
          raise NotImplementedError('This method must be implemented by subclasses')

      def __getitem__(self, idx):
          assert self._use_cache, 'Indexing PagedList requires cache'
          if not isinstance(idx, int) or idx < 0:
              raise TypeError('indices must be non-negative integers')
          entries = self.getslice(idx, idx + 1)
          if entries:
              return entries[0]
          raise self.IndexError()
  class OnDemandPagedList(PagedList):
      """Download pages until a page with less than maximum results"""

      def _getslice(self, start, end):
          pagesize = self._pagesize
          for pagenum in itertools.count(start // pagesize):
              firstid = pagenum * pagesize
              nextfirstid = firstid + pagesize
              if start >= nextfirstid:
                  continue

              # Offsets within this page at which the requested items begin and end
              startv = start % pagesize if firstid <= start < nextfirstid else 0
              if end is not None and firstid <= end <= nextfirstid:
                  endv = ((end - 1) % pagesize) + 1
              else:
                  endv = None

              try:
                  page_results = self.getpage(pagenum)
              except Exception:
                  # Remember that there is nothing to fetch from this page onwards
                  self._pagecount = pagenum - 1
                  raise
              if startv != 0 or endv is not None:
                  page_results = page_results[startv:endv]
              yield from page_results

              # A little optimization - if current page is not "full", ie. does
              # not contain page_size videos then we can assume that this page
              # is the last one - there are no more ids on further pages -
              # i.e. no need to query again.
              is_last_page = len(page_results) + startv < pagesize
              # If we got the whole page, but the next page is not interesting,
              # break out early as well
              if is_last_page or end == nextfirstid:
                  break
  class InAdvancePagedList(PagedList):
      """PagedList with total number of pages known in advance"""

      def __init__(self, pagefunc, pagecount, pagesize):
          PagedList.__init__(self, pagefunc, pagesize, True)
          self._pagecount = pagecount

      def _getslice(self, start, end):
          # Page that contains `start`, and the number of items to skip on that page
          start_page, skip_elems = divmod(start, self._pagesize)
          if end is None:
              end_page, only_more = self._pagecount, None
          else:
              end_page = min(self._pagecount, end // self._pagesize + 1)
              # Number of items that are still to be yielded
              only_more = end - start

          for pagenum in range(start_page, end_page):
              page_results = self.getpage(pagenum)
              if skip_elems:
                  page_results = page_results[skip_elems:]
                  skip_elems = None
              if only_more is not None:
                  if len(page_results) >= only_more:
                      yield from page_results[:only_more]
                      break
                  only_more -= len(page_results)
              yield from page_results
  2397. class PlaylistEntries:
  2398. MissingEntry = object()
  2399. is_exhausted = False
  2400. def __init__(self, ydl, info_dict):
  2401. self.ydl = ydl
  2402. # _entries must be assigned now since infodict can change during iteration
  2403. entries = info_dict.get('entries')
  2404. if entries is None:
  2405. raise EntryNotInPlaylist('There are no entries')
  2406. elif isinstance(entries, list):
  2407. self.is_exhausted = True
  2408. requested_entries = info_dict.get('requested_entries')
  2409. self.is_incomplete = requested_entries is not None
  2410. if self.is_incomplete:
  2411. assert self.is_exhausted
  2412. self._entries = [self.MissingEntry] * max(requested_entries or [0])
  2413. for i, entry in zip(requested_entries, entries):
  2414. self._entries[i - 1] = entry
  2415. elif isinstance(entries, (list, PagedList, LazyList)):
  2416. self._entries = entries
  2417. else:
  2418. self._entries = LazyList(entries)
  2419. PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
  2420. (?P<start>[+-]?\d+)?
  2421. (?P<range>[:-]
  2422. (?P<end>[+-]?\d+|inf(?:inite)?)?
  2423. (?::(?P<step>[+-]?\d+))?
  2424. )?''')
  2425. @classmethod
  2426. def parse_playlist_items(cls, string):
  2427. for segment in string.split(','):
  2428. if not segment:
  2429. raise ValueError('There is two or more consecutive commas')
  2430. mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
  2431. if not mobj:
  2432. raise ValueError(f'{segment!r} is not a valid specification')
  2433. start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
  2434. if int_or_none(step) == 0:
  2435. raise ValueError(f'Step in {segment!r} cannot be zero')
  2436. yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
  2437. def get_requested_items(self):
  2438. playlist_items = self.ydl.params.get('playlist_items')
  2439. playlist_start = self.ydl.params.get('playliststart', 1)
  2440. playlist_end = self.ydl.params.get('playlistend')
  2441. # For backwards compatibility, interpret -1 as whole list
  2442. if playlist_end in (-1, None):
  2443. playlist_end = ''
  2444. if not playlist_items:
  2445. playlist_items = f'{playlist_start}:{playlist_end}'
  2446. elif playlist_start != 1 or playlist_end:
  2447. self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
  2448. for index in self.parse_playlist_items(playlist_items):
  2449. for i, entry in self[index]:
  2450. yield i, entry
  2451. if not entry:
  2452. continue
  2453. try:
  2454. # TODO: Add auto-generated fields
  2455. self.ydl._match_entry(entry, incomplete=True, silent=True)
  2456. except (ExistingVideoReached, RejectedVideoReached):
  2457. return
  2458. def get_full_count(self):
  2459. if self.is_exhausted and not self.is_incomplete:
  2460. return len(self)
  2461. elif isinstance(self._entries, InAdvancePagedList):
  2462. if self._entries._pagesize == 1:
  2463. return self._entries._pagecount
    @functools.cached_property
    def _getter(self):
        """Build (once per instance) the function fetching an entry by 0-based index.

        The returned callable raises ``self.IndexError`` when the index lies
        beyond the available entries.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                # Inside a method body this name resolves to the builtin
                # IndexError, not to the nested class of the same name
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                # Reached for a stored MissingEntry placeholder, or for an
                # out-of-range index on an incomplete playlist
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route the lookup through the downloader's
                    # _handle_extraction_exceptions wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry
    def __getitem__(self, idx):
        """Yield ``(1-based index, entry)`` pairs for an int or slice selector.

        Selectors are 1-based and inclusive on both ends; negative values are
        resolved against ``len(self)``.
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Push the bound one step further in the direction of travel so that
        # the requested stop index itself is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Ascending: nothing further can exist. Descending: lower
                # indices may still be valid, so keep going
                if step > 0:
                    break
                continue
            yield i + 1, entry
  2510. def __len__(self):
  2511. return len(tuple(self[:]))
    class IndexError(IndexError):
        """Raised by the entry getter and caught in __getitem__ when an index is past the end."""
        pass
  2514. def uppercase_escape(s):
  2515. unicode_escape = codecs.getdecoder('unicode_escape')
  2516. return re.sub(
  2517. r'\\U[0-9a-fA-F]{8}',
  2518. lambda m: unicode_escape(m.group(0))[0],
  2519. s)
  2520. def lowercase_escape(s):
  2521. unicode_escape = codecs.getdecoder('unicode_escape')
  2522. return re.sub(
  2523. r'\\u[0-9a-fA-F]{4}',
  2524. lambda m: unicode_escape(m.group(0))[0],
  2525. s)
  2526. def escape_rfc3986(s):
  2527. """Escape non-ASCII characters as suggested by RFC 3986"""
  2528. return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
  2529. def escape_url(url):
  2530. """Escape URL as suggested by RFC 3986"""
  2531. url_parsed = urllib.parse.urlparse(url)
  2532. return url_parsed._replace(
  2533. netloc=url_parsed.netloc.encode('idna').decode('ascii'),
  2534. path=escape_rfc3986(url_parsed.path),
  2535. params=escape_rfc3986(url_parsed.params),
  2536. query=escape_rfc3986(url_parsed.query),
  2537. fragment=escape_rfc3986(url_parsed.fragment)
  2538. ).geturl()
  2539. def parse_qs(url, **kwargs):
  2540. return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
  2541. def read_batch_urls(batch_fd):
  2542. def fixup(url):
  2543. if not isinstance(url, str):
  2544. url = url.decode('utf-8', 'replace')
  2545. BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
  2546. for bom in BOM_UTF8:
  2547. if url.startswith(bom):
  2548. url = url[len(bom):]
  2549. url = url.lstrip()
  2550. if not url or url.startswith(('#', ';', ']')):
  2551. return False
  2552. # "#" cannot be stripped out since it is part of the URI
  2553. # However, it can be safely stripped out if following a whitespace
  2554. return re.split(r'\s#', url, 1)[0].rstrip()
  2555. with contextlib.closing(batch_fd) as fd:
  2556. return [url for url in map(fixup, fd) if url]
  2557. def urlencode_postdata(*args, **kargs):
  2558. return urllib.parse.urlencode(*args, **kargs).encode('ascii')
  2559. def update_url_query(url, query):
  2560. if not query:
  2561. return url
  2562. parsed_url = urllib.parse.urlparse(url)
  2563. qs = urllib.parse.parse_qs(parsed_url.query)
  2564. qs.update(query)
  2565. return urllib.parse.urlunparse(parsed_url._replace(
  2566. query=urllib.parse.urlencode(qs, True)))
  2567. def update_Request(req, url=None, data=None, headers=None, query=None):
  2568. req_headers = req.headers.copy()
  2569. req_headers.update(headers or {})
  2570. req_data = data or req.data
  2571. req_url = update_url_query(url or req.get_full_url(), query)
  2572. req_get_method = req.get_method()
  2573. if req_get_method == 'HEAD':
  2574. req_type = HEADRequest
  2575. elif req_get_method == 'PUT':
  2576. req_type = PUTRequest
  2577. else:
  2578. req_type = urllib.request.Request
  2579. new_req = req_type(
  2580. req_url, data=req_data, headers=req_headers,
  2581. origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
  2582. if hasattr(req, 'timeout'):
  2583. new_req.timeout = req.timeout
  2584. return new_req
  2585. def _multipart_encode_impl(data, boundary):
  2586. content_type = 'multipart/form-data; boundary=%s' % boundary
  2587. out = b''
  2588. for k, v in data.items():
  2589. out += b'--' + boundary.encode('ascii') + b'\r\n'
  2590. if isinstance(k, str):
  2591. k = k.encode()
  2592. if isinstance(v, str):
  2593. v = v.encode()
  2594. # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
  2595. # suggests sending UTF-8 directly. Firefox sends UTF-8, too
  2596. content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
  2597. if boundary.encode('ascii') in content:
  2598. raise ValueError('Boundary overlaps with data')
  2599. out += content
  2600. out += b'--' + boundary.encode('ascii') + b'--\r\n'
  2601. return out, content_type
  2602. def multipart_encode(data, boundary=None):
  2603. '''
  2604. Encode a dict to RFC 7578-compliant form-data
  2605. data:
  2606. A dict where keys and values can be either Unicode or bytes-like
  2607. objects.
  2608. boundary:
  2609. If specified a Unicode object, it's used as the boundary. Otherwise
  2610. a random boundary is generated.
  2611. Reference: https://tools.ietf.org/html/rfc7578
  2612. '''
  2613. has_specified_boundary = boundary is not None
  2614. while True:
  2615. if boundary is None:
  2616. boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
  2617. try:
  2618. out, content_type = _multipart_encode_impl(data, boundary)
  2619. break
  2620. except ValueError:
  2621. if has_specified_boundary:
  2622. raise
  2623. boundary = None
  2624. return out, content_type
  2625. def variadic(x, allowed_types=(str, bytes, dict)):
  2626. return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
  2627. def dict_get(d, key_or_keys, default=None, skip_false_values=True):
  2628. for val in map(d.get, variadic(key_or_keys)):
  2629. if val is not None and (val or not skip_false_values):
  2630. return val
  2631. return default
  2632. def try_call(*funcs, expected_type=None, args=[], kwargs={}):
  2633. for f in funcs:
  2634. try:
  2635. val = f(*args, **kwargs)
  2636. except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
  2637. pass
  2638. else:
  2639. if expected_type is None or isinstance(val, expected_type):
  2640. return val
  2641. def try_get(src, getter, expected_type=None):
  2642. return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
  2643. def filter_dict(dct, cndn=lambda _, v: v is not None):
  2644. return {k: v for k, v in dct.items() if cndn(k, v)}
  2645. def merge_dicts(*dicts):
  2646. merged = {}
  2647. for a_dict in dicts:
  2648. for k, v in a_dict.items():
  2649. if (v is not None and k not in merged
  2650. or isinstance(v, str) and merged[k] == ''):
  2651. merged[k] = v
  2652. return merged
  2653. def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
  2654. return string if isinstance(string, str) else str(string, encoding, errors)
  2655. US_RATINGS = {
  2656. 'G': 0,
  2657. 'PG': 10,
  2658. 'PG-13': 13,
  2659. 'R': 16,
  2660. 'NC': 18,
  2661. }
  2662. TV_PARENTAL_GUIDELINES = {
  2663. 'TV-Y': 0,
  2664. 'TV-Y7': 7,
  2665. 'TV-G': 0,
  2666. 'TV-PG': 0,
  2667. 'TV-14': 14,
  2668. 'TV-MA': 17,
  2669. }
  2670. def parse_age_limit(s):
  2671. # isinstance(False, int) is True. So type() must be used instead
  2672. if type(s) is int: # noqa: E721
  2673. return s if 0 <= s <= 21 else None
  2674. elif not isinstance(s, str):
  2675. return None
  2676. m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
  2677. if m:
  2678. return int(m.group('age'))
  2679. s = s.upper()
  2680. if s in US_RATINGS:
  2681. return US_RATINGS[s]
  2682. m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
  2683. if m:
  2684. return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
  2685. return None
  2686. def strip_jsonp(code):
  2687. return re.sub(
  2688. r'''(?sx)^
  2689. (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
  2690. (?:\s*&&\s*(?P=func_name))?
  2691. \s*\(\s*(?P<callback_data>.*)\);?
  2692. \s*?(?://[^\n]*)*$''',
  2693. r'\g<callback_data>', code)
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript literal syntax in *code* into JSON text using regex substitution.

    @param code    JavaScript source text
    @param vars    Mapping of identifier -> value; matching identifiers are
                   replaced by the JSON dump of their value
    @param strict  When true, an identifier that cannot be resolved raises
                   ValueError instead of being emitted as a quoted string
    @returns       The converted text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"'
    # A single- or double-quoted string, backslash escapes included
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # A /* ... */ comment, or a // comment up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and octal integer literals,
    # each optionally followed by ":" (i.e. used as an object key)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Passthrough characters keep (or gain) their backslash, "\x" becomes
        # "\u00", a backslash-newline is dropped, and for anything else only
        # the character itself is kept
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote with double quotes, normalising the escapes inside
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Keys become quoted decimal strings, values plain decimals
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            return json.dumps(vars[v])

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" -> JSON object; the argument is converted recursively
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # 'new Date("...")' is reduced to its string argument; any other
        # "new X(...)" expression is kept verbatim as a JSON string
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)

    return re.sub(rf'''(?sx)
{STRING_RE}|
{COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
\b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
[0-9]+(?={SKIP_RE}:)|
!+
''', fix_kv, code)
  2746. def qualities(quality_ids):
  2747. """ Get a numeric quality value out of a list of possible values """
  2748. def q(qid):
  2749. try:
  2750. return quality_ids.index(qid)
  2751. except ValueError:
  2752. return -1
  2753. return q
# Names of the stages at which a postprocessor may run
# NOTE(review): the order presumably mirrors the processing order - confirm against the callers
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output types
# NOTE(review): the non-None values look like filename suffixes for that
# type of file - confirm against the code consuming this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Regex template with two positional placeholders: {0} is the pattern for the
# mapping key and {1} the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
(?<!%)(?P<prefix>(?:%%)*)
%
(?P<has_key>\((?P<key>{0})\))?
(?P<format>
(?P<conversion>[#0\-+ ]+)?
(?P<min_width>\d+)?
(?P<precision>\.\d+)?
(?P<len_mod>[hlL])? # unused in python
{1} # conversion type
)
'''


# Conversion type characters of printf-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
  2788. def limit_length(s, length):
  2789. """ Add ellipses to overly long strings """
  2790. if s is None:
  2791. return None
  2792. ELLIPSES = '...'
  2793. if len(s) > length:
  2794. return s[:length - len(ELLIPSES)] + ELLIPSES
  2795. return s
  2796. def version_tuple(v):
  2797. return tuple(int(e) for e in re.split(r'[-.]', v))
  2798. def is_outdated_version(version, limit, assume_new=True):
  2799. if not version:
  2800. return not assume_new
  2801. try:
  2802. return version_tuple(version) < version_tuple(limit)
  2803. except ValueError:
  2804. return not assume_new
  2805. def ytdl_is_updateable():
  2806. """ Returns if hypervideo can be updated with -U """
  2807. from .update import is_non_updateable
  2808. return not is_non_updateable()
  2809. def args_to_str(args):
  2810. # Get a short string representation for a subprocess command
  2811. return ' '.join(compat_shlex_quote(a) for a in args)
  2812. def error_to_compat_str(err):
  2813. return str(err)
  2814. def error_to_str(err):
  2815. return f'{type(err).__name__}: {err}'
  2816. def mimetype2ext(mt):
  2817. if mt is None:
  2818. return None
  2819. mt, _, params = mt.partition(';')
  2820. mt = mt.strip()
  2821. FULL_MAP = {
  2822. 'audio/mp4': 'm4a',
  2823. # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
  2824. # it's the most popular one
  2825. 'audio/mpeg': 'mp3',
  2826. 'audio/x-wav': 'wav',
  2827. 'audio/wav': 'wav',
  2828. 'audio/wave': 'wav',
  2829. }
  2830. ext = FULL_MAP.get(mt)
  2831. if ext is not None:
  2832. return ext
  2833. SUBTYPE_MAP = {
  2834. '3gpp': '3gp',
  2835. 'smptett+xml': 'tt',
  2836. 'ttaf+xml': 'dfxp',
  2837. 'ttml+xml': 'ttml',
  2838. 'x-flv': 'flv',
  2839. 'x-mp4-fragmented': 'mp4',
  2840. 'x-ms-sami': 'sami',
  2841. 'x-ms-wmv': 'wmv',
  2842. 'mpegurl': 'm3u8',
  2843. 'x-mpegurl': 'm3u8',
  2844. 'vnd.apple.mpegurl': 'm3u8',
  2845. 'dash+xml': 'mpd',
  2846. 'f4m+xml': 'f4m',
  2847. 'hds+xml': 'f4m',
  2848. 'vnd.ms-sstr+xml': 'ism',
  2849. 'quicktime': 'mov',
  2850. 'mp2t': 'ts',
  2851. 'x-wav': 'wav',
  2852. 'filmstrip+json': 'fs',
  2853. 'svg+xml': 'svg',
  2854. }
  2855. _, _, subtype = mt.rpartition('/')
  2856. ext = SUBTYPE_MAP.get(subtype.lower())
  2857. if ext is not None:
  2858. return ext
  2859. SUFFIX_MAP = {
  2860. 'json': 'json',
  2861. 'xml': 'xml',
  2862. 'zip': 'zip',
  2863. 'gzip': 'gz',
  2864. }
  2865. _, _, suffix = subtype.partition('+')
  2866. ext = SUFFIX_MAP.get(suffix)
  2867. if ext is not None:
  2868. return ext
  2869. return subtype.replace('+', '.')
  2870. def ext2mimetype(ext_or_url):
  2871. if not ext_or_url:
  2872. return None
  2873. if '.' not in ext_or_url:
  2874. ext_or_url = f'file.{ext_or_url}'
  2875. return mimetypes.guess_type(ext_or_url)[0]
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs.

    @returns {} for empty input; otherwise a dict with 'vcodec' and 'acodec'
             (the string 'none' when absent), 'dynamic_range' and, if a
             subtitle codec was found, 'scodec'. When no codec is recognised
             but exactly two were given, they are taken to be video and audio,
             in that order.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Drop zeros that directly precede other digits so that
        # zero-padded codec names compare equal to the canonical spelling
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            # Only the first video codec is used
            if vcodec:
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given streams.

    @param vcodecs, acodecs  Codecs of the video / audio streams
    @param vexts, aexts      Extensions of the video / audio streams; each
                             list must be as long as the matching codec list
    @param preferences       Optional extensions to try, in order
    @returns                 The chosen extension; falls back to 'mkv' when
                             allowed, else to the last preference
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce the first codec of a list to its name before the first "." with
    # every "0" removed; try_get presumably yields None when the list is
    # empty - confirm
    sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First attempt: match on the codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Second attempt: match on the extensions of the source streams
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
  2948. def urlhandle_detect_ext(url_handle):
  2949. getheader = url_handle.headers.get
  2950. cd = getheader('Content-Disposition')
  2951. if cd:
  2952. m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
  2953. if m:
  2954. e = determine_ext(m.group('filename'), default_ext=None)
  2955. if e:
  2956. return e
  2957. return mimetype2ext(getheader('Content-Type'))
  2958. def encode_data_uri(data, mime_type):
  2959. return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
  2960. def age_restricted(content_limit, age_limit):
  2961. """ Returns True iff the content should be blocked """
  2962. if age_limit is None: # No limit set
  2963. return False
  2964. if content_limit is None:
  2965. return False # Content available for everyone
  2966. return age_limit < content_limit
  2967. # List of known byte-order-marks (BOM)
  2968. BOMS = [
  2969. (b'\xef\xbb\xbf', 'utf-8'),
  2970. (b'\x00\x00\xfe\xff', 'utf-32-be'),
  2971. (b'\xff\xfe\x00\x00', 'utf-32-le'),
  2972. (b'\xff\xfe', 'utf-16-le'),
  2973. (b'\xfe\xff', 'utf-16-be'),
  2974. ]
  2975. def is_html(first_bytes):
  2976. """ Detect whether a file contains HTML by examining its first bytes. """
  2977. encoding = 'utf-8'
  2978. for bom, enc in BOMS:
  2979. while first_bytes.startswith(bom):
  2980. encoding, first_bytes = enc, first_bytes[len(bom):]
  2981. return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
  2982. def determine_protocol(info_dict):
  2983. protocol = info_dict.get('protocol')
  2984. if protocol is not None:
  2985. return protocol
  2986. url = sanitize_url(info_dict['url'])
  2987. if url.startswith('rtmp'):
  2988. return 'rtmp'
  2989. elif url.startswith('mms'):
  2990. return 'mms'
  2991. elif url.startswith('rtsp'):
  2992. return 'rtsp'
  2993. ext = determine_ext(url)
  2994. if ext == 'm3u8':
  2995. return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
  2996. elif ext == 'f4m':
  2997. return 'f4m'
  2998. return urllib.parse.urlparse(url).scheme
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  List of column titles
    @param data        List of rows, each a list of cell values
    @param delim       If truthy, a string repeated to draw a separator row
                       below the header
    @param extra_gap   Number of extra spaces between columns
    @param hide_empty  If true, drop the columns whose data cells are all of
                       zero width
    @returns           The table as one string, rows separated by newlines
    """
    def width(string):
        # Visible width: terminal sequences and tabs do not count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep the cells whose matching filter entry is truthy; cells without
        # a filter entry are kept as well (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty the data widths act as the column filter: width 0 is falsy
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Expand the tab into the padding, pushing what follows it to the right edge
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
  3025. def _match_one(filter_part, dct, incomplete):
  3026. # TODO: Generalize code with YoutubeDL._build_format_filter
  3027. STRING_OPERATORS = {
  3028. '*=': operator.contains,
  3029. '^=': lambda attr, value: attr.startswith(value),
  3030. '$=': lambda attr, value: attr.endswith(value),
  3031. '~=': lambda attr, value: re.search(value, attr),
  3032. }
  3033. COMPARISON_OPERATORS = {
  3034. **STRING_OPERATORS,
  3035. '<=': operator.le, # "<=" must be defined above "<"
  3036. '<': operator.lt,
  3037. '>=': operator.ge,
  3038. '>': operator.gt,
  3039. '=': operator.eq,
  3040. }
  3041. if isinstance(incomplete, bool):
  3042. is_incomplete = lambda _: incomplete
  3043. else:
  3044. is_incomplete = lambda k: k in incomplete
  3045. operator_rex = re.compile(r'''(?x)
  3046. (?P<key>[a-z_]+)
  3047. \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
  3048. (?:
  3049. (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
  3050. (?P<strval>.+?)
  3051. )
  3052. ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
  3053. m = operator_rex.fullmatch(filter_part.strip())
  3054. if m:
  3055. m = m.groupdict()
  3056. unnegated_op = COMPARISON_OPERATORS[m['op']]
  3057. if m['negation']:
  3058. op = lambda attr, value: not unnegated_op(attr, value)
  3059. else:
  3060. op = unnegated_op
  3061. comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
  3062. if m['quote']:
  3063. comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
  3064. actual_value = dct.get(m['key'])
  3065. numeric_comparison = None
  3066. if isinstance(actual_value, (int, float)):
  3067. # If the original field is a string and matching comparisonvalue is
  3068. # a number we should respect the origin of the original field
  3069. # and process comparison value as a string (see
  3070. # https://github.com/ytdl-org/youtube-dl/issues/11082)
  3071. try:
  3072. numeric_comparison = int(comparison_value)
  3073. except ValueError:
  3074. numeric_comparison = parse_filesize(comparison_value)
  3075. if numeric_comparison is None:
  3076. numeric_comparison = parse_filesize(f'{comparison_value}B')
  3077. if numeric_comparison is None:
  3078. numeric_comparison = parse_duration(comparison_value)
  3079. if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
  3080. raise ValueError('Operator %s only supports string values!' % m['op'])
  3081. if actual_value is None:
  3082. return is_incomplete(m['key']) or m['none_inclusive']
  3083. return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
  3084. UNARY_OPERATORS = {
  3085. '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
  3086. '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
  3087. }
  3088. operator_rex = re.compile(r'''(?x)
  3089. (?P<op>%s)\s*(?P<key>[a-z_]+)
  3090. ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
  3091. m = operator_rex.fullmatch(filter_part.strip())
  3092. if m:
  3093. op = UNARY_OPERATORS[m.group('op')]
  3094. actual_value = dct.get(m.group('key'))
  3095. if is_incomplete(m.group('key')) and actual_value is None:
  3096. return True
  3097. return op(actual_value)
  3098. raise ValueError('Invalid filter part %r' % filter_part)
  3099. def match_str(filter_str, dct, incomplete=False):
  3100. """ Filter a dictionary with a simple string syntax.
  3101. @returns Whether the filter passes
  3102. @param incomplete Set of keys that is expected to be missing from dct.
  3103. Can be True/False to indicate all/none of the keys may be missing.
  3104. All conditions on incomplete keys pass if the key is missing
  3105. """
  3106. return all(
  3107. _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
  3108. for filter_part in re.split(r'(?<!\\)&', filter_str))
  3109. def match_filter_func(filters):
  3110. if not filters:
  3111. return None
  3112. filters = set(variadic(filters))
  3113. interactive = '-' in filters
  3114. if interactive:
  3115. filters.remove('-')
  3116. def _match_func(info_dict, incomplete=False):
  3117. if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
  3118. return NO_DEFAULT if interactive and not incomplete else None
  3119. else:
  3120. video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
  3121. filter_str = ') | ('.join(map(str.strip, filters))
  3122. return f'{video_title} does not pass filter ({filter_str}), skipping ..'
  3123. return _match_func
  3124. class download_range_func:
  3125. def __init__(self, chapters, ranges):
  3126. self.chapters, self.ranges = chapters, ranges
  3127. def __call__(self, info_dict, ydl):
  3128. if not self.ranges and not self.chapters:
  3129. yield {}
  3130. warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
  3131. else 'Cannot match chapters since chapter information is unavailable')
  3132. for regex in self.chapters or []:
  3133. for i, chapter in enumerate(info_dict.get('chapters') or []):
  3134. if re.search(regex, chapter['title']):
  3135. warning = None
  3136. yield {**chapter, 'index': i}
  3137. if self.chapters and warning:
  3138. ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
  3139. yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
  3140. def __eq__(self, other):
  3141. return (isinstance(other, download_range_func)
  3142. and self.chapters == other.chapters and self.ranges == other.ranges)
  3143. def parse_dfxp_time_expr(time_expr):
  3144. if not time_expr:
  3145. return
  3146. mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
  3147. if mobj:
  3148. return float(mobj.group('time_offset'))
  3149. mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
  3150. if mobj:
  3151. return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  3152. def srt_subtitles_timecode(seconds):
  3153. return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
  3154. def ass_subtitles_timecode(seconds):
  3155. time = timetuple_from_msec(seconds * 1000)
  3156. return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, rendering the supported styling as
    <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # (current namespace, [legacy namespaces rewritten to it before parsing])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # The tts:* styling attributes that are honoured
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> resolved properties; filled in before any paragraph is rendered
    styles = {}
    # Style inherited from <body>/<div>, applied to every element
    default_style = {}

    class TTMLPElementParser:
        # Parser target turning one <p> element into SRT text
        _out = ''
        # NOTE(review): the two lists below are class attributes mutated in
        # place, hence shared by every parser instance created during one
        # dfxp2srt call. A style pushed in start() but not popped in end()
        # (which only pops when tags were opened) would carry over to the
        # following paragraphs - confirm whether this is intended
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest first: default style, referenced style, inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    # Push the style now in effect: the enclosing one overlaid with this element's
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close, in reverse order, the tags opened for this element
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialise the element again and replay it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance: a style whose parent is not resolved yet is
    # skipped and another pass over all styles is scheduled
    # NOTE(review): a style naming a parent that is never defined sets
    # `repeat` on every pass, so this loop looks like it cannot terminate
    # for such input - confirm
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # The styles referenced by <body> and <div> form the default style
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Paragraphs without usable timing are skipped; their index is still
        # consumed, so the cue numbers written out may have gaps
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
  3303. def cli_option(params, command_option, param, separator=None):
  3304. param = params.get(param)
  3305. return ([] if param is None
  3306. else [command_option, str(param)] if separator is None
  3307. else [f'{command_option}{separator}{param}'])
  3308. def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
  3309. param = params.get(param)
  3310. assert param in (True, False, None)
  3311. return cli_option({True: true_value, False: false_value}, command_option, param, separator)
  3312. def cli_valueless_option(params, command_option, param, expected_value=True):
  3313. return [command_option] if params.get(param) == expected_value else []
  3314. def cli_configuration_args(argdict, keys, default=[], use_compat=True):
  3315. if isinstance(argdict, (list, tuple)): # for backward compatibility
  3316. if use_compat:
  3317. return argdict
  3318. else:
  3319. argdict = None
  3320. if argdict is None:
  3321. return default
  3322. assert isinstance(argdict, dict)
  3323. assert isinstance(keys, (list, tuple))
  3324. for key_list in keys:
  3325. arg_list = list(filter(
  3326. lambda x: x is not None,
  3327. [argdict.get(key.lower()) for key in variadic(key_list)]))
  3328. if arg_list:
  3329. return [arg for args in arg_list for arg in args]
  3330. return default
  3331. def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
  3332. main_key, exe = main_key.lower(), exe.lower()
  3333. root_key = exe if main_key == exe else f'{main_key}+{exe}'
  3334. keys = [f'{root_key}{k}' for k in (keys or [''])]
  3335. if root_key in keys:
  3336. if main_key != exe:
  3337. keys.append((main_key, exe))
  3338. keys.append('default')
  3339. else:
  3340. use_compat = False
  3341. return cli_configuration_args(argdict, keys, default, use_compat)
  3342. class ISO639Utils:
  3343. # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
  3344. _lang_map = {
  3345. 'aa': 'aar',
  3346. 'ab': 'abk',
  3347. 'ae': 'ave',
  3348. 'af': 'afr',
  3349. 'ak': 'aka',
  3350. 'am': 'amh',
  3351. 'an': 'arg',
  3352. 'ar': 'ara',
  3353. 'as': 'asm',
  3354. 'av': 'ava',
  3355. 'ay': 'aym',
  3356. 'az': 'aze',
  3357. 'ba': 'bak',
  3358. 'be': 'bel',
  3359. 'bg': 'bul',
  3360. 'bh': 'bih',
  3361. 'bi': 'bis',
  3362. 'bm': 'bam',
  3363. 'bn': 'ben',
  3364. 'bo': 'bod',
  3365. 'br': 'bre',
  3366. 'bs': 'bos',
  3367. 'ca': 'cat',
  3368. 'ce': 'che',
  3369. 'ch': 'cha',
  3370. 'co': 'cos',
  3371. 'cr': 'cre',
  3372. 'cs': 'ces',
  3373. 'cu': 'chu',
  3374. 'cv': 'chv',
  3375. 'cy': 'cym',
  3376. 'da': 'dan',
  3377. 'de': 'deu',
  3378. 'dv': 'div',
  3379. 'dz': 'dzo',
  3380. 'ee': 'ewe',
  3381. 'el': 'ell',
  3382. 'en': 'eng',
  3383. 'eo': 'epo',
  3384. 'es': 'spa',
  3385. 'et': 'est',
  3386. 'eu': 'eus',
  3387. 'fa': 'fas',
  3388. 'ff': 'ful',
  3389. 'fi': 'fin',
  3390. 'fj': 'fij',
  3391. 'fo': 'fao',
  3392. 'fr': 'fra',
  3393. 'fy': 'fry',
  3394. 'ga': 'gle',
  3395. 'gd': 'gla',
  3396. 'gl': 'glg',
  3397. 'gn': 'grn',
  3398. 'gu': 'guj',
  3399. 'gv': 'glv',
  3400. 'ha': 'hau',
  3401. 'he': 'heb',
  3402. 'iw': 'heb', # Replaced by he in 1989 revision
  3403. 'hi': 'hin',
  3404. 'ho': 'hmo',
  3405. 'hr': 'hrv',
  3406. 'ht': 'hat',
  3407. 'hu': 'hun',
  3408. 'hy': 'hye',
  3409. 'hz': 'her',
  3410. 'ia': 'ina',
  3411. 'id': 'ind',
  3412. 'in': 'ind', # Replaced by id in 1989 revision
  3413. 'ie': 'ile',
  3414. 'ig': 'ibo',
  3415. 'ii': 'iii',
  3416. 'ik': 'ipk',
  3417. 'io': 'ido',
  3418. 'is': 'isl',
  3419. 'it': 'ita',
  3420. 'iu': 'iku',
  3421. 'ja': 'jpn',
  3422. 'jv': 'jav',
  3423. 'ka': 'kat',
  3424. 'kg': 'kon',
  3425. 'ki': 'kik',
  3426. 'kj': 'kua',
  3427. 'kk': 'kaz',
  3428. 'kl': 'kal',
  3429. 'km': 'khm',
  3430. 'kn': 'kan',
  3431. 'ko': 'kor',
  3432. 'kr': 'kau',
  3433. 'ks': 'kas',
  3434. 'ku': 'kur',
  3435. 'kv': 'kom',
  3436. 'kw': 'cor',
  3437. 'ky': 'kir',
  3438. 'la': 'lat',
  3439. 'lb': 'ltz',
  3440. 'lg': 'lug',
  3441. 'li': 'lim',
  3442. 'ln': 'lin',
  3443. 'lo': 'lao',
  3444. 'lt': 'lit',
  3445. 'lu': 'lub',
  3446. 'lv': 'lav',
  3447. 'mg': 'mlg',
  3448. 'mh': 'mah',
  3449. 'mi': 'mri',
  3450. 'mk': 'mkd',
  3451. 'ml': 'mal',
  3452. 'mn': 'mon',
  3453. 'mr': 'mar',
  3454. 'ms': 'msa',
  3455. 'mt': 'mlt',
  3456. 'my': 'mya',
  3457. 'na': 'nau',
  3458. 'nb': 'nob',
  3459. 'nd': 'nde',
  3460. 'ne': 'nep',
  3461. 'ng': 'ndo',
  3462. 'nl': 'nld',
  3463. 'nn': 'nno',
  3464. 'no': 'nor',
  3465. 'nr': 'nbl',
  3466. 'nv': 'nav',
  3467. 'ny': 'nya',
  3468. 'oc': 'oci',
  3469. 'oj': 'oji',
  3470. 'om': 'orm',
  3471. 'or': 'ori',
  3472. 'os': 'oss',
  3473. 'pa': 'pan',
  3474. 'pi': 'pli',
  3475. 'pl': 'pol',
  3476. 'ps': 'pus',
  3477. 'pt': 'por',
  3478. 'qu': 'que',
  3479. 'rm': 'roh',
  3480. 'rn': 'run',
  3481. 'ro': 'ron',
  3482. 'ru': 'rus',
  3483. 'rw': 'kin',
  3484. 'sa': 'san',
  3485. 'sc': 'srd',
  3486. 'sd': 'snd',
  3487. 'se': 'sme',
  3488. 'sg': 'sag',
  3489. 'si': 'sin',
  3490. 'sk': 'slk',
  3491. 'sl': 'slv',
  3492. 'sm': 'smo',
  3493. 'sn': 'sna',
  3494. 'so': 'som',
  3495. 'sq': 'sqi',
  3496. 'sr': 'srp',
  3497. 'ss': 'ssw',
  3498. 'st': 'sot',
  3499. 'su': 'sun',
  3500. 'sv': 'swe',
  3501. 'sw': 'swa',
  3502. 'ta': 'tam',
  3503. 'te': 'tel',
  3504. 'tg': 'tgk',
  3505. 'th': 'tha',
  3506. 'ti': 'tir',
  3507. 'tk': 'tuk',
  3508. 'tl': 'tgl',
  3509. 'tn': 'tsn',
  3510. 'to': 'ton',
  3511. 'tr': 'tur',
  3512. 'ts': 'tso',
  3513. 'tt': 'tat',
  3514. 'tw': 'twi',
  3515. 'ty': 'tah',
  3516. 'ug': 'uig',
  3517. 'uk': 'ukr',
  3518. 'ur': 'urd',
  3519. 'uz': 'uzb',
  3520. 've': 'ven',
  3521. 'vi': 'vie',
  3522. 'vo': 'vol',
  3523. 'wa': 'wln',
  3524. 'wo': 'wol',
  3525. 'xh': 'xho',
  3526. 'yi': 'yid',
  3527. 'ji': 'yid', # Replaced by yi in 1989 revision
  3528. 'yo': 'yor',
  3529. 'za': 'zha',
  3530. 'zh': 'zho',
  3531. 'zu': 'zul',
  3532. }
  3533. @classmethod
  3534. def short2long(cls, code):
  3535. """Convert language code from ISO 639-1 to ISO 639-2/T"""
  3536. return cls._lang_map.get(code[:2])
  3537. @classmethod
  3538. def long2short(cls, code):
  3539. """Convert language code from ISO 639-2/T to ISO 639-1"""
  3540. for short_name, long_name in cls._lang_map.items():
  3541. if long_name == code:
  3542. return short_name
  3543. class ISO3166Utils:
  3544. # From http://data.okfn.org/data/core/country-list
  3545. _country_map = {
  3546. 'AF': 'Afghanistan',
  3547. 'AX': 'Åland Islands',
  3548. 'AL': 'Albania',
  3549. 'DZ': 'Algeria',
  3550. 'AS': 'American Samoa',
  3551. 'AD': 'Andorra',
  3552. 'AO': 'Angola',
  3553. 'AI': 'Anguilla',
  3554. 'AQ': 'Antarctica',
  3555. 'AG': 'Antigua and Barbuda',
  3556. 'AR': 'Argentina',
  3557. 'AM': 'Armenia',
  3558. 'AW': 'Aruba',
  3559. 'AU': 'Australia',
  3560. 'AT': 'Austria',
  3561. 'AZ': 'Azerbaijan',
  3562. 'BS': 'Bahamas',
  3563. 'BH': 'Bahrain',
  3564. 'BD': 'Bangladesh',
  3565. 'BB': 'Barbados',
  3566. 'BY': 'Belarus',
  3567. 'BE': 'Belgium',
  3568. 'BZ': 'Belize',
  3569. 'BJ': 'Benin',
  3570. 'BM': 'Bermuda',
  3571. 'BT': 'Bhutan',
  3572. 'BO': 'Bolivia, Plurinational State of',
  3573. 'BQ': 'Bonaire, Sint Eustatius and Saba',
  3574. 'BA': 'Bosnia and Herzegovina',
  3575. 'BW': 'Botswana',
  3576. 'BV': 'Bouvet Island',
  3577. 'BR': 'Brazil',
  3578. 'IO': 'British Indian Ocean Territory',
  3579. 'BN': 'Brunei Darussalam',
  3580. 'BG': 'Bulgaria',
  3581. 'BF': 'Burkina Faso',
  3582. 'BI': 'Burundi',
  3583. 'KH': 'Cambodia',
  3584. 'CM': 'Cameroon',
  3585. 'CA': 'Canada',
  3586. 'CV': 'Cape Verde',
  3587. 'KY': 'Cayman Islands',
  3588. 'CF': 'Central African Republic',
  3589. 'TD': 'Chad',
  3590. 'CL': 'Chile',
  3591. 'CN': 'China',
  3592. 'CX': 'Christmas Island',
  3593. 'CC': 'Cocos (Keeling) Islands',
  3594. 'CO': 'Colombia',
  3595. 'KM': 'Comoros',
  3596. 'CG': 'Congo',
  3597. 'CD': 'Congo, the Democratic Republic of the',
  3598. 'CK': 'Cook Islands',
  3599. 'CR': 'Costa Rica',
  3600. 'CI': 'Côte d\'Ivoire',
  3601. 'HR': 'Croatia',
  3602. 'CU': 'Cuba',
  3603. 'CW': 'Curaçao',
  3604. 'CY': 'Cyprus',
  3605. 'CZ': 'Czech Republic',
  3606. 'DK': 'Denmark',
  3607. 'DJ': 'Djibouti',
  3608. 'DM': 'Dominica',
  3609. 'DO': 'Dominican Republic',
  3610. 'EC': 'Ecuador',
  3611. 'EG': 'Egypt',
  3612. 'SV': 'El Salvador',
  3613. 'GQ': 'Equatorial Guinea',
  3614. 'ER': 'Eritrea',
  3615. 'EE': 'Estonia',
  3616. 'ET': 'Ethiopia',
  3617. 'FK': 'Falkland Islands (Malvinas)',
  3618. 'FO': 'Faroe Islands',
  3619. 'FJ': 'Fiji',
  3620. 'FI': 'Finland',
  3621. 'FR': 'France',
  3622. 'GF': 'French Guiana',
  3623. 'PF': 'French Polynesia',
  3624. 'TF': 'French Southern Territories',
  3625. 'GA': 'Gabon',
  3626. 'GM': 'Gambia',
  3627. 'GE': 'Georgia',
  3628. 'DE': 'Germany',
  3629. 'GH': 'Ghana',
  3630. 'GI': 'Gibraltar',
  3631. 'GR': 'Greece',
  3632. 'GL': 'Greenland',
  3633. 'GD': 'Grenada',
  3634. 'GP': 'Guadeloupe',
  3635. 'GU': 'Guam',
  3636. 'GT': 'Guatemala',
  3637. 'GG': 'Guernsey',
  3638. 'GN': 'Guinea',
  3639. 'GW': 'Guinea-Bissau',
  3640. 'GY': 'Guyana',
  3641. 'HT': 'Haiti',
  3642. 'HM': 'Heard Island and McDonald Islands',
  3643. 'VA': 'Holy See (Vatican City State)',
  3644. 'HN': 'Honduras',
  3645. 'HK': 'Hong Kong',
  3646. 'HU': 'Hungary',
  3647. 'IS': 'Iceland',
  3648. 'IN': 'India',
  3649. 'ID': 'Indonesia',
  3650. 'IR': 'Iran, Islamic Republic of',
  3651. 'IQ': 'Iraq',
  3652. 'IE': 'Ireland',
  3653. 'IM': 'Isle of Man',
  3654. 'IL': 'Israel',
  3655. 'IT': 'Italy',
  3656. 'JM': 'Jamaica',
  3657. 'JP': 'Japan',
  3658. 'JE': 'Jersey',
  3659. 'JO': 'Jordan',
  3660. 'KZ': 'Kazakhstan',
  3661. 'KE': 'Kenya',
  3662. 'KI': 'Kiribati',
  3663. 'KP': 'Korea, Democratic People\'s Republic of',
  3664. 'KR': 'Korea, Republic of',
  3665. 'KW': 'Kuwait',
  3666. 'KG': 'Kyrgyzstan',
  3667. 'LA': 'Lao People\'s Democratic Republic',
  3668. 'LV': 'Latvia',
  3669. 'LB': 'Lebanon',
  3670. 'LS': 'Lesotho',
  3671. 'LR': 'Liberia',
  3672. 'LY': 'Libya',
  3673. 'LI': 'Liechtenstein',
  3674. 'LT': 'Lithuania',
  3675. 'LU': 'Luxembourg',
  3676. 'MO': 'Macao',
  3677. 'MK': 'Macedonia, the Former Yugoslav Republic of',
  3678. 'MG': 'Madagascar',
  3679. 'MW': 'Malawi',
  3680. 'MY': 'Malaysia',
  3681. 'MV': 'Maldives',
  3682. 'ML': 'Mali',
  3683. 'MT': 'Malta',
  3684. 'MH': 'Marshall Islands',
  3685. 'MQ': 'Martinique',
  3686. 'MR': 'Mauritania',
  3687. 'MU': 'Mauritius',
  3688. 'YT': 'Mayotte',
  3689. 'MX': 'Mexico',
  3690. 'FM': 'Micronesia, Federated States of',
  3691. 'MD': 'Moldova, Republic of',
  3692. 'MC': 'Monaco',
  3693. 'MN': 'Mongolia',
  3694. 'ME': 'Montenegro',
  3695. 'MS': 'Montserrat',
  3696. 'MA': 'Morocco',
  3697. 'MZ': 'Mozambique',
  3698. 'MM': 'Myanmar',
  3699. 'NA': 'Namibia',
  3700. 'NR': 'Nauru',
  3701. 'NP': 'Nepal',
  3702. 'NL': 'Netherlands',
  3703. 'NC': 'New Caledonia',
  3704. 'NZ': 'New Zealand',
  3705. 'NI': 'Nicaragua',
  3706. 'NE': 'Niger',
  3707. 'NG': 'Nigeria',
  3708. 'NU': 'Niue',
  3709. 'NF': 'Norfolk Island',
  3710. 'MP': 'Northern Mariana Islands',
  3711. 'NO': 'Norway',
  3712. 'OM': 'Oman',
  3713. 'PK': 'Pakistan',
  3714. 'PW': 'Palau',
  3715. 'PS': 'Palestine, State of',
  3716. 'PA': 'Panama',
  3717. 'PG': 'Papua New Guinea',
  3718. 'PY': 'Paraguay',
  3719. 'PE': 'Peru',
  3720. 'PH': 'Philippines',
  3721. 'PN': 'Pitcairn',
  3722. 'PL': 'Poland',
  3723. 'PT': 'Portugal',
  3724. 'PR': 'Puerto Rico',
  3725. 'QA': 'Qatar',
  3726. 'RE': 'Réunion',
  3727. 'RO': 'Romania',
  3728. 'RU': 'Russian Federation',
  3729. 'RW': 'Rwanda',
  3730. 'BL': 'Saint Barthélemy',
  3731. 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
  3732. 'KN': 'Saint Kitts and Nevis',
  3733. 'LC': 'Saint Lucia',
  3734. 'MF': 'Saint Martin (French part)',
  3735. 'PM': 'Saint Pierre and Miquelon',
  3736. 'VC': 'Saint Vincent and the Grenadines',
  3737. 'WS': 'Samoa',
  3738. 'SM': 'San Marino',
  3739. 'ST': 'Sao Tome and Principe',
  3740. 'SA': 'Saudi Arabia',
  3741. 'SN': 'Senegal',
  3742. 'RS': 'Serbia',
  3743. 'SC': 'Seychelles',
  3744. 'SL': 'Sierra Leone',
  3745. 'SG': 'Singapore',
  3746. 'SX': 'Sint Maarten (Dutch part)',
  3747. 'SK': 'Slovakia',
  3748. 'SI': 'Slovenia',
  3749. 'SB': 'Solomon Islands',
  3750. 'SO': 'Somalia',
  3751. 'ZA': 'South Africa',
  3752. 'GS': 'South Georgia and the South Sandwich Islands',
  3753. 'SS': 'South Sudan',
  3754. 'ES': 'Spain',
  3755. 'LK': 'Sri Lanka',
  3756. 'SD': 'Sudan',
  3757. 'SR': 'Suriname',
  3758. 'SJ': 'Svalbard and Jan Mayen',
  3759. 'SZ': 'Swaziland',
  3760. 'SE': 'Sweden',
  3761. 'CH': 'Switzerland',
  3762. 'SY': 'Syrian Arab Republic',
  3763. 'TW': 'Taiwan, Province of China',
  3764. 'TJ': 'Tajikistan',
  3765. 'TZ': 'Tanzania, United Republic of',
  3766. 'TH': 'Thailand',
  3767. 'TL': 'Timor-Leste',
  3768. 'TG': 'Togo',
  3769. 'TK': 'Tokelau',
  3770. 'TO': 'Tonga',
  3771. 'TT': 'Trinidad and Tobago',
  3772. 'TN': 'Tunisia',
  3773. 'TR': 'Turkey',
  3774. 'TM': 'Turkmenistan',
  3775. 'TC': 'Turks and Caicos Islands',
  3776. 'TV': 'Tuvalu',
  3777. 'UG': 'Uganda',
  3778. 'UA': 'Ukraine',
  3779. 'AE': 'United Arab Emirates',
  3780. 'GB': 'United Kingdom',
  3781. 'US': 'United States',
  3782. 'UM': 'United States Minor Outlying Islands',
  3783. 'UY': 'Uruguay',
  3784. 'UZ': 'Uzbekistan',
  3785. 'VU': 'Vanuatu',
  3786. 'VE': 'Venezuela, Bolivarian Republic of',
  3787. 'VN': 'Viet Nam',
  3788. 'VG': 'Virgin Islands, British',
  3789. 'VI': 'Virgin Islands, U.S.',
  3790. 'WF': 'Wallis and Futuna',
  3791. 'EH': 'Western Sahara',
  3792. 'YE': 'Yemen',
  3793. 'ZM': 'Zambia',
  3794. 'ZW': 'Zimbabwe',
  3795. # Not ISO 3166 codes, but used for IP blocks
  3796. 'AP': 'Asia/Pacific Region',
  3797. 'EU': 'Europe',
  3798. }
  3799. @classmethod
  3800. def short2full(cls, code):
  3801. """Convert an ISO 3166-2 country code to the corresponding full name"""
  3802. return cls._country_map.get(code.upper())
  3803. class GeoUtils:
  3804. # Major IPv4 address blocks per country
  3805. _country_ip_map = {
  3806. 'AD': '46.172.224.0/19',
  3807. 'AE': '94.200.0.0/13',
  3808. 'AF': '149.54.0.0/17',
  3809. 'AG': '209.59.64.0/18',
  3810. 'AI': '204.14.248.0/21',
  3811. 'AL': '46.99.0.0/16',
  3812. 'AM': '46.70.0.0/15',
  3813. 'AO': '105.168.0.0/13',
  3814. 'AP': '182.50.184.0/21',
  3815. 'AQ': '23.154.160.0/24',
  3816. 'AR': '181.0.0.0/12',
  3817. 'AS': '202.70.112.0/20',
  3818. 'AT': '77.116.0.0/14',
  3819. 'AU': '1.128.0.0/11',
  3820. 'AW': '181.41.0.0/18',
  3821. 'AX': '185.217.4.0/22',
  3822. 'AZ': '5.197.0.0/16',
  3823. 'BA': '31.176.128.0/17',
  3824. 'BB': '65.48.128.0/17',
  3825. 'BD': '114.130.0.0/16',
  3826. 'BE': '57.0.0.0/8',
  3827. 'BF': '102.178.0.0/15',
  3828. 'BG': '95.42.0.0/15',
  3829. 'BH': '37.131.0.0/17',
  3830. 'BI': '154.117.192.0/18',
  3831. 'BJ': '137.255.0.0/16',
  3832. 'BL': '185.212.72.0/23',
  3833. 'BM': '196.12.64.0/18',
  3834. 'BN': '156.31.0.0/16',
  3835. 'BO': '161.56.0.0/16',
  3836. 'BQ': '161.0.80.0/20',
  3837. 'BR': '191.128.0.0/12',
  3838. 'BS': '24.51.64.0/18',
  3839. 'BT': '119.2.96.0/19',
  3840. 'BW': '168.167.0.0/16',
  3841. 'BY': '178.120.0.0/13',
  3842. 'BZ': '179.42.192.0/18',
  3843. 'CA': '99.224.0.0/11',
  3844. 'CD': '41.243.0.0/16',
  3845. 'CF': '197.242.176.0/21',
  3846. 'CG': '160.113.0.0/16',
  3847. 'CH': '85.0.0.0/13',
  3848. 'CI': '102.136.0.0/14',
  3849. 'CK': '202.65.32.0/19',
  3850. 'CL': '152.172.0.0/14',
  3851. 'CM': '102.244.0.0/14',
  3852. 'CN': '36.128.0.0/10',
  3853. 'CO': '181.240.0.0/12',
  3854. 'CR': '201.192.0.0/12',
  3855. 'CU': '152.206.0.0/15',
  3856. 'CV': '165.90.96.0/19',
  3857. 'CW': '190.88.128.0/17',
  3858. 'CY': '31.153.0.0/16',
  3859. 'CZ': '88.100.0.0/14',
  3860. 'DE': '53.0.0.0/8',
  3861. 'DJ': '197.241.0.0/17',
  3862. 'DK': '87.48.0.0/12',
  3863. 'DM': '192.243.48.0/20',
  3864. 'DO': '152.166.0.0/15',
  3865. 'DZ': '41.96.0.0/12',
  3866. 'EC': '186.68.0.0/15',
  3867. 'EE': '90.190.0.0/15',
  3868. 'EG': '156.160.0.0/11',
  3869. 'ER': '196.200.96.0/20',
  3870. 'ES': '88.0.0.0/11',
  3871. 'ET': '196.188.0.0/14',
  3872. 'EU': '2.16.0.0/13',
  3873. 'FI': '91.152.0.0/13',
  3874. 'FJ': '144.120.0.0/16',
  3875. 'FK': '80.73.208.0/21',
  3876. 'FM': '119.252.112.0/20',
  3877. 'FO': '88.85.32.0/19',
  3878. 'FR': '90.0.0.0/9',
  3879. 'GA': '41.158.0.0/15',
  3880. 'GB': '25.0.0.0/8',
  3881. 'GD': '74.122.88.0/21',
  3882. 'GE': '31.146.0.0/16',
  3883. 'GF': '161.22.64.0/18',
  3884. 'GG': '62.68.160.0/19',
  3885. 'GH': '154.160.0.0/12',
  3886. 'GI': '95.164.0.0/16',
  3887. 'GL': '88.83.0.0/19',
  3888. 'GM': '160.182.0.0/15',
  3889. 'GN': '197.149.192.0/18',
  3890. 'GP': '104.250.0.0/19',
  3891. 'GQ': '105.235.224.0/20',
  3892. 'GR': '94.64.0.0/13',
  3893. 'GT': '168.234.0.0/16',
  3894. 'GU': '168.123.0.0/16',
  3895. 'GW': '197.214.80.0/20',
  3896. 'GY': '181.41.64.0/18',
  3897. 'HK': '113.252.0.0/14',
  3898. 'HN': '181.210.0.0/16',
  3899. 'HR': '93.136.0.0/13',
  3900. 'HT': '148.102.128.0/17',
  3901. 'HU': '84.0.0.0/14',
  3902. 'ID': '39.192.0.0/10',
  3903. 'IE': '87.32.0.0/12',
  3904. 'IL': '79.176.0.0/13',
  3905. 'IM': '5.62.80.0/20',
  3906. 'IN': '117.192.0.0/10',
  3907. 'IO': '203.83.48.0/21',
  3908. 'IQ': '37.236.0.0/14',
  3909. 'IR': '2.176.0.0/12',
  3910. 'IS': '82.221.0.0/16',
  3911. 'IT': '79.0.0.0/10',
  3912. 'JE': '87.244.64.0/18',
  3913. 'JM': '72.27.0.0/17',
  3914. 'JO': '176.29.0.0/16',
  3915. 'JP': '133.0.0.0/8',
  3916. 'KE': '105.48.0.0/12',
  3917. 'KG': '158.181.128.0/17',
  3918. 'KH': '36.37.128.0/17',
  3919. 'KI': '103.25.140.0/22',
  3920. 'KM': '197.255.224.0/20',
  3921. 'KN': '198.167.192.0/19',
  3922. 'KP': '175.45.176.0/22',
  3923. 'KR': '175.192.0.0/10',
  3924. 'KW': '37.36.0.0/14',
  3925. 'KY': '64.96.0.0/15',
  3926. 'KZ': '2.72.0.0/13',
  3927. 'LA': '115.84.64.0/18',
  3928. 'LB': '178.135.0.0/16',
  3929. 'LC': '24.92.144.0/20',
  3930. 'LI': '82.117.0.0/19',
  3931. 'LK': '112.134.0.0/15',
  3932. 'LR': '102.183.0.0/16',
  3933. 'LS': '129.232.0.0/17',
  3934. 'LT': '78.56.0.0/13',
  3935. 'LU': '188.42.0.0/16',
  3936. 'LV': '46.109.0.0/16',
  3937. 'LY': '41.252.0.0/14',
  3938. 'MA': '105.128.0.0/11',
  3939. 'MC': '88.209.64.0/18',
  3940. 'MD': '37.246.0.0/16',
  3941. 'ME': '178.175.0.0/17',
  3942. 'MF': '74.112.232.0/21',
  3943. 'MG': '154.126.0.0/17',
  3944. 'MH': '117.103.88.0/21',
  3945. 'MK': '77.28.0.0/15',
  3946. 'ML': '154.118.128.0/18',
  3947. 'MM': '37.111.0.0/17',
  3948. 'MN': '49.0.128.0/17',
  3949. 'MO': '60.246.0.0/16',
  3950. 'MP': '202.88.64.0/20',
  3951. 'MQ': '109.203.224.0/19',
  3952. 'MR': '41.188.64.0/18',
  3953. 'MS': '208.90.112.0/22',
  3954. 'MT': '46.11.0.0/16',
  3955. 'MU': '105.16.0.0/12',
  3956. 'MV': '27.114.128.0/18',
  3957. 'MW': '102.70.0.0/15',
  3958. 'MX': '187.192.0.0/11',
  3959. 'MY': '175.136.0.0/13',
  3960. 'MZ': '197.218.0.0/15',
  3961. 'NA': '41.182.0.0/16',
  3962. 'NC': '101.101.0.0/18',
  3963. 'NE': '197.214.0.0/18',
  3964. 'NF': '203.17.240.0/22',
  3965. 'NG': '105.112.0.0/12',
  3966. 'NI': '186.76.0.0/15',
  3967. 'NL': '145.96.0.0/11',
  3968. 'NO': '84.208.0.0/13',
  3969. 'NP': '36.252.0.0/15',
  3970. 'NR': '203.98.224.0/19',
  3971. 'NU': '49.156.48.0/22',
  3972. 'NZ': '49.224.0.0/14',
  3973. 'OM': '5.36.0.0/15',
  3974. 'PA': '186.72.0.0/15',
  3975. 'PE': '186.160.0.0/14',
  3976. 'PF': '123.50.64.0/18',
  3977. 'PG': '124.240.192.0/19',
  3978. 'PH': '49.144.0.0/13',
  3979. 'PK': '39.32.0.0/11',
  3980. 'PL': '83.0.0.0/11',
  3981. 'PM': '70.36.0.0/20',
  3982. 'PR': '66.50.0.0/16',
  3983. 'PS': '188.161.0.0/16',
  3984. 'PT': '85.240.0.0/13',
  3985. 'PW': '202.124.224.0/20',
  3986. 'PY': '181.120.0.0/14',
  3987. 'QA': '37.210.0.0/15',
  3988. 'RE': '102.35.0.0/16',
  3989. 'RO': '79.112.0.0/13',
  3990. 'RS': '93.86.0.0/15',
  3991. 'RU': '5.136.0.0/13',
  3992. 'RW': '41.186.0.0/16',
  3993. 'SA': '188.48.0.0/13',
  3994. 'SB': '202.1.160.0/19',
  3995. 'SC': '154.192.0.0/11',
  3996. 'SD': '102.120.0.0/13',
  3997. 'SE': '78.64.0.0/12',
  3998. 'SG': '8.128.0.0/10',
  3999. 'SI': '188.196.0.0/14',
  4000. 'SK': '78.98.0.0/15',
  4001. 'SL': '102.143.0.0/17',
  4002. 'SM': '89.186.32.0/19',
  4003. 'SN': '41.82.0.0/15',
  4004. 'SO': '154.115.192.0/18',
  4005. 'SR': '186.179.128.0/17',
  4006. 'SS': '105.235.208.0/21',
  4007. 'ST': '197.159.160.0/19',
  4008. 'SV': '168.243.0.0/16',
  4009. 'SX': '190.102.0.0/20',
  4010. 'SY': '5.0.0.0/16',
  4011. 'SZ': '41.84.224.0/19',
  4012. 'TC': '65.255.48.0/20',
  4013. 'TD': '154.68.128.0/19',
  4014. 'TG': '196.168.0.0/14',
  4015. 'TH': '171.96.0.0/13',
  4016. 'TJ': '85.9.128.0/18',
  4017. 'TK': '27.96.24.0/21',
  4018. 'TL': '180.189.160.0/20',
  4019. 'TM': '95.85.96.0/19',
  4020. 'TN': '197.0.0.0/11',
  4021. 'TO': '175.176.144.0/21',
  4022. 'TR': '78.160.0.0/11',
  4023. 'TT': '186.44.0.0/15',
  4024. 'TV': '202.2.96.0/19',
  4025. 'TW': '120.96.0.0/11',
  4026. 'TZ': '156.156.0.0/14',
  4027. 'UA': '37.52.0.0/14',
  4028. 'UG': '102.80.0.0/13',
  4029. 'US': '6.0.0.0/8',
  4030. 'UY': '167.56.0.0/13',
  4031. 'UZ': '84.54.64.0/18',
  4032. 'VA': '212.77.0.0/19',
  4033. 'VC': '207.191.240.0/21',
  4034. 'VE': '186.88.0.0/13',
  4035. 'VG': '66.81.192.0/20',
  4036. 'VI': '146.226.0.0/16',
  4037. 'VN': '14.160.0.0/11',
  4038. 'VU': '202.80.32.0/20',
  4039. 'WF': '117.20.32.0/21',
  4040. 'WS': '202.4.32.0/19',
  4041. 'YE': '134.35.0.0/16',
  4042. 'YT': '41.242.116.0/22',
  4043. 'ZA': '41.0.0.0/11',
  4044. 'ZM': '102.144.0.0/13',
  4045. 'ZW': '102.177.192.0/18',
  4046. }
  4047. @classmethod
  4048. def random_ipv4(cls, code_or_block):
  4049. if len(code_or_block) == 2:
  4050. block = cls._country_ip_map.get(code_or_block.upper())
  4051. if not block:
  4052. return None
  4053. else:
  4054. block = code_or_block
  4055. addr, preflen = block.split('/')
  4056. addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
  4057. addr_max = addr_min | (0xffffffff >> int(preflen))
  4058. return str(socket.inet_ntoa(
  4059. struct.pack('!L', random.randint(addr_min, addr_max))))
  4060. class PerRequestProxyHandler(urllib.request.ProxyHandler):
  4061. def __init__(self, proxies=None):
  4062. # Set default handlers
  4063. for type in ('http', 'https'):
  4064. setattr(self, '%s_open' % type,
  4065. lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
  4066. meth(r, proxy, type))
  4067. urllib.request.ProxyHandler.__init__(self, proxies)
  4068. def proxy_open(self, req, proxy, type):
  4069. req_proxy = req.headers.get('Ytdl-request-proxy')
  4070. if req_proxy is not None:
  4071. proxy = req_proxy
  4072. del req.headers['Ytdl-request-proxy']
  4073. if proxy == '__noproxy__':
  4074. return None # No Proxy
  4075. if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
  4076. req.add_header('Ytdl-socks-proxy', proxy)
  4077. # hypervideo's http/https handlers do wrapping the socket with socks
  4078. return None
  4079. return urllib.request.ProxyHandler.proxy_open(
  4080. self, req, proxy, type)
  4081. # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
  4082. # released into Public Domain
  4083. # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
  4084. def long_to_bytes(n, blocksize=0):
  4085. """long_to_bytes(n:long, blocksize:int) : string
  4086. Convert a long integer to a byte string.
  4087. If optional blocksize is given and greater than zero, pad the front of the
  4088. byte string with binary zeros so that the length is a multiple of
  4089. blocksize.
  4090. """
  4091. # after much testing, this algorithm was deemed to be the fastest
  4092. s = b''
  4093. n = int(n)
  4094. while n > 0:
  4095. s = struct.pack('>I', n & 0xffffffff) + s
  4096. n = n >> 32
  4097. # strip off leading zeros
  4098. for i in range(len(s)):
  4099. if s[i] != b'\000'[0]:
  4100. break
  4101. else:
  4102. # only happens when n == 0
  4103. s = b'\000'
  4104. i = 0
  4105. s = s[i:]
  4106. # add back some pad bytes. this could be done more efficiently w.r.t. the
  4107. # de-padding being done above, but sigh...
  4108. if blocksize > 0 and len(s) % blocksize:
  4109. s = (blocksize - len(s) % blocksize) * b'\000' + s
  4110. return s
  4111. def bytes_to_long(s):
  4112. """bytes_to_long(string) : long
  4113. Convert a byte string to a long integer.
  4114. This is (essentially) the inverse of long_to_bytes().
  4115. """
  4116. acc = 0
  4117. length = len(s)
  4118. if length % 4:
  4119. extra = (4 - length % 4)
  4120. s = b'\000' * extra + s
  4121. length = length + extra
  4122. for i in range(0, length, 4):
  4123. acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
  4124. return acc
  4125. def ohdave_rsa_encrypt(data, exponent, modulus):
  4126. '''
  4127. Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
  4128. Input:
  4129. data: data to encrypt, bytes-like object
  4130. exponent, modulus: parameter e and N of RSA algorithm, both integer
  4131. Output: hex string of encrypted data
  4132. Limitation: supports one block encryption only
  4133. '''
  4134. payload = int(binascii.hexlify(data[::-1]), 16)
  4135. encrypted = pow(payload, exponent, modulus)
  4136. return '%x' % encrypted
  4137. def pkcs1pad(data, length):
  4138. """
  4139. Padding input data with PKCS#1 scheme
  4140. @param {int[]} data input data
  4141. @param {int} length target length
  4142. @returns {int[]} padded data
  4143. """
  4144. if len(data) > length - 11:
  4145. raise ValueError('Input data too long for PKCS#1 padding')
  4146. pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
  4147. return [0, 2] + pseudo_random + [0] + data
  4148. def _base_n_table(n, table):
  4149. if not table and not n:
  4150. raise ValueError('Either table or n must be specified')
  4151. table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
  4152. if n and n != len(table):
  4153. raise ValueError(f'base {n} exceeds table length {len(table)}')
  4154. return table
  4155. def encode_base_n(num, n=None, table=None):
  4156. """Convert given int to a base-n string"""
  4157. table = _base_n_table(n, table)
  4158. if not num:
  4159. return table[0]
  4160. result, base = '', len(table)
  4161. while num:
  4162. result = table[num % base] + result
  4163. num = num // base
  4164. return result
  4165. def decode_base_n(string, n=None, table=None):
  4166. """Convert given base-n string to int"""
  4167. table = {char: index for index, char in enumerate(_base_n_table(n, table))}
  4168. result, base = 0, len(table)
  4169. for char in string:
  4170. result = result * base + table[char]
  4171. return result
  4172. def decode_base(value, digits):
  4173. deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
  4174. f'in a future version. Use {__name__}.decode_base_n instead')
  4175. return decode_base_n(value, table=digits)
  4176. def decode_packed_codes(code):
  4177. mobj = re.search(PACKED_CODES_RE, code)
  4178. obfuscated_code, base, count, symbols = mobj.groups()
  4179. base = int(base)
  4180. count = int(count)
  4181. symbols = symbols.split('|')
  4182. symbol_table = {}
  4183. while count:
  4184. count -= 1
  4185. base_n_count = encode_base_n(count, base)
  4186. symbol_table[base_n_count] = symbols[count] or base_n_count
  4187. return re.sub(
  4188. r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
  4189. obfuscated_code)
  4190. def caesar(s, alphabet, shift):
  4191. if shift == 0:
  4192. return s
  4193. l = len(alphabet)
  4194. return ''.join(
  4195. alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
  4196. for c in s)
  4197. def rot47(s):
  4198. return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
  4199. def parse_m3u8_attributes(attrib):
  4200. info = {}
  4201. for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
  4202. if val.startswith('"'):
  4203. val = val[1:-1]
  4204. info[key] = val
  4205. return info
  4206. def urshift(val, n):
  4207. return val >> n if val >= 0 else (val + 0x100000000) >> n
  4208. # Based on png2str() written by @gdkchan and improved by @yokrysty
  4209. # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
  4210. def decode_png(png_data):
  4211. # Reference: https://www.w3.org/TR/PNG/
  4212. header = png_data[8:]
  4213. if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
  4214. raise OSError('Not a valid PNG file.')
  4215. int_map = {1: '>B', 2: '>H', 4: '>I'}
  4216. unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
  4217. chunks = []
  4218. while header:
  4219. length = unpack_integer(header[:4])
  4220. header = header[4:]
  4221. chunk_type = header[:4]
  4222. header = header[4:]
  4223. chunk_data = header[:length]
  4224. header = header[length:]
  4225. header = header[4:] # Skip CRC
  4226. chunks.append({
  4227. 'type': chunk_type,
  4228. 'length': length,
  4229. 'data': chunk_data
  4230. })
  4231. ihdr = chunks[0]['data']
  4232. width = unpack_integer(ihdr[:4])
  4233. height = unpack_integer(ihdr[4:8])
  4234. idat = b''
  4235. for chunk in chunks:
  4236. if chunk['type'] == b'IDAT':
  4237. idat += chunk['data']
  4238. if not idat:
  4239. raise OSError('Unable to read PNG data.')
  4240. decompressed_data = bytearray(zlib.decompress(idat))
  4241. stride = width * 3
  4242. pixels = []
  4243. def _get_pixel(idx):
  4244. x = idx % stride
  4245. y = idx // stride
  4246. return pixels[y][x]
  4247. for y in range(height):
  4248. basePos = y * (1 + stride)
  4249. filter_type = decompressed_data[basePos]
  4250. current_row = []
  4251. pixels.append(current_row)
  4252. for x in range(stride):
  4253. color = decompressed_data[1 + basePos + x]
  4254. basex = y * stride + x
  4255. left = 0
  4256. up = 0
  4257. if x > 2:
  4258. left = _get_pixel(basex - 3)
  4259. if y > 0:
  4260. up = _get_pixel(basex - stride)
  4261. if filter_type == 1: # Sub
  4262. color = (color + left) & 0xff
  4263. elif filter_type == 2: # Up
  4264. color = (color + up) & 0xff
  4265. elif filter_type == 3: # Average
  4266. color = (color + ((left + up) >> 1)) & 0xff
  4267. elif filter_type == 4: # Paeth
  4268. a = left
  4269. b = up
  4270. c = 0
  4271. if x > 2 and y > 0:
  4272. c = _get_pixel(basex - stride - 3)
  4273. p = a + b - c
  4274. pa = abs(p - a)
  4275. pb = abs(p - b)
  4276. pc = abs(p - c)
  4277. if pa <= pb and pa <= pc:
  4278. color = (color + a) & 0xff
  4279. elif pb <= pc:
  4280. color = (color + b) & 0xff
  4281. else:
  4282. color = (color + c) & 0xff
  4283. current_row.append(color)
  4284. return width, height, pixels
  4285. def write_xattr(path, key, value):
  4286. # Windows: Write xattrs to NTFS Alternate Data Streams:
  4287. # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
  4288. if compat_os_name == 'nt':
  4289. assert ':' not in key
  4290. assert os.path.exists(path)
  4291. try:
  4292. with open(f'{path}:{key}', 'wb') as f:
  4293. f.write(value)
  4294. except OSError as e:
  4295. raise XAttrMetadataError(e.errno, e.strerror)
  4296. return
  4297. # UNIX Method 1. Use xattrs/pyxattrs modules
  4298. setxattr = None
  4299. if getattr(xattr, '_hypervideo_dl__identifier', None) == 'pyxattr':
  4300. # Unicode arguments are not supported in pyxattr until version 0.5.0
  4301. # See https://github.com/ytdl-org/youtube-dl/issues/5498
  4302. if version_tuple(xattr.__version__) >= (0, 5, 0):
  4303. setxattr = xattr.set
  4304. elif xattr:
  4305. setxattr = xattr.setxattr
  4306. if setxattr:
  4307. try:
  4308. setxattr(path, key, value)
  4309. except OSError as e:
  4310. raise XAttrMetadataError(e.errno, e.strerror)
  4311. return
  4312. # UNIX Method 2. Use setfattr/xattr executables
  4313. exe = ('setfattr' if check_executable('setfattr', ['--version'])
  4314. else 'xattr' if check_executable('xattr', ['-h']) else None)
  4315. if not exe:
  4316. raise XAttrUnavailableError(
  4317. 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
  4318. + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
  4319. value = value.decode()
  4320. try:
  4321. _, stderr, returncode = Popen.run(
  4322. [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
  4323. text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
  4324. except OSError as e:
  4325. raise XAttrMetadataError(e.errno, e.strerror)
  4326. if returncode:
  4327. raise XAttrMetadataError(returncode, stderr)
  4328. def random_birthday(year_field, month_field, day_field):
  4329. start_date = datetime.date(1950, 1, 1)
  4330. end_date = datetime.date(1995, 12, 31)
  4331. offset = random.randint(0, (end_date - start_date).days)
  4332. random_date = start_date + datetime.timedelta(offset)
  4333. return {
  4334. year_field: str(random_date.year),
  4335. month_field: str(random_date.month),
  4336. day_field: str(random_date.day),
  4337. }
# Templates for internet shortcut files, which are plain text files.
# Each template is a `%`-style format string taking a mapping;
# `LINK_TEMPLATES` below maps a link type name to its template.

# `[InternetShortcut]` INI-style shortcut; expects the key `url`
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (plist) shortcut; expects the key `url`
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` shortcut of type `Link`; expects the keys `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup of the templates above by link type
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
  4366. def iri_to_uri(iri):
  4367. """
  4368. Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
  4369. The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
  4370. """
  4371. iri_parts = urllib.parse.urlparse(iri)
  4372. if '[' in iri_parts.netloc:
  4373. raise ValueError('IPv6 URIs are not, yet, supported.')
  4374. # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
  4375. # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
  4376. net_location = ''
  4377. if iri_parts.username:
  4378. net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
  4379. if iri_parts.password is not None:
  4380. net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
  4381. net_location += '@'
  4382. net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
  4383. # The 'idna' encoding produces ASCII text.
  4384. if iri_parts.port is not None and iri_parts.port != 80:
  4385. net_location += ':' + str(iri_parts.port)
  4386. return urllib.parse.urlunparse(
  4387. (iri_parts.scheme,
  4388. net_location,
  4389. urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
  4390. # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
  4391. urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
  4392. # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
  4393. urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
  4394. urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
  4395. # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
  4396. def to_high_limit_path(path):
  4397. if sys.platform in ['win32', 'cygwin']:
  4398. # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
  4399. return '\\\\?\\' + os.path.abspath(path)
  4400. return path
  4401. def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
  4402. val = traverse_obj(obj, *variadic(field))
  4403. if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
  4404. return default
  4405. return template % func(val)
  4406. def clean_podcast_url(url):
  4407. return re.sub(r'''(?x)
  4408. (?:
  4409. (?:
  4410. chtbl\.com/track|
  4411. media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
  4412. play\.podtrac\.com
  4413. )/[^/]+|
  4414. (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
  4415. flex\.acast\.com|
  4416. pd(?:
  4417. cn\.co| # https://podcorn.com/analytics-prefix/
  4418. st\.fm # https://podsights.com/docs/
  4419. )/e
  4420. )/''', '', url)
  4421. _HEX_TABLE = '0123456789abcdef'
  4422. def random_uuidv4():
  4423. return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
  4424. def make_dir(path, to_screen=None):
  4425. try:
  4426. dn = os.path.dirname(path)
  4427. if dn and not os.path.exists(dn):
  4428. os.makedirs(dn)
  4429. return True
  4430. except OSError as err:
  4431. if callable(to_screen) is not None:
  4432. to_screen('unable to create directory ' + error_to_compat_str(err))
  4433. return False
  4434. def get_executable_path():
  4435. from zipimport import zipimporter
  4436. if hasattr(sys, 'frozen'): # Running from PyInstaller
  4437. path = os.path.dirname(sys.executable)
  4438. elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
  4439. path = os.path.join(os.path.dirname(__file__), '../..')
  4440. else:
  4441. path = os.path.join(os.path.dirname(__file__), '..')
  4442. return os.path.abspath(path)
  4443. def load_plugins(name, suffix, namespace):
  4444. classes = {}
  4445. with contextlib.suppress(FileNotFoundError):
  4446. plugins_spec = importlib.util.spec_from_file_location(
  4447. name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
  4448. plugins = importlib.util.module_from_spec(plugins_spec)
  4449. sys.modules[plugins_spec.name] = plugins
  4450. plugins_spec.loader.exec_module(plugins)
  4451. for name in dir(plugins):
  4452. if name in namespace:
  4453. continue
  4454. if not name.endswith(suffix):
  4455. continue
  4456. klass = getattr(plugins, name)
  4457. classes[name] = namespace[name] = klass
  4458. return classes
def traverse_obj(
        obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    """
    Safely traverse nested `dict`s and `Sequence`s

    >>> obj = [{}, {"key": "value"}]
    >>> traverse_obj(obj, (1, "key"))
    'value'

    Each of the provided `paths` is tested and the first producing a valid result will be returned.
    The next path will also be tested if the path branched but no results could be found.
    Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
    A value of None is treated as the absence of a value.

    The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.

    The keys in the path can be one of:
        - `None`:           Return the current object.
        - `str`/`int`:      Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
        - `slice`:          Return `obj[key]`, i.e. the sliced sequence, as a single value.
                            Unlike the keys below, this does not branch.
        - `Ellipsis`:       Branch out and return a list of all values.
        - `tuple`/`list`:   Branch out and return a list of all matching values.
                            Read as: `[traverse_obj(obj, branch) for branch in branches]`.
        - `function`:       Branch out and return values filtered by the function.
                            Read as: `[value for key, value in obj if function(key, value)]`.
                            For `Sequence`s, `key` is the index of the value.
        - `dict`            Transform the current object and return a matching dict.
                            Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.

        `tuple`, `list`, and `dict` all support nested paths and branches.

    @params paths           Paths which to traverse by.
    @param default          Value to return if the paths do not match.
    @param expected_type    If a `type`, only accept final values of this type.
                            If any other callable, try to call the function on each result.
    @param get_all          If `False`, return the first matching result, otherwise all matching ones.
    @param casesense        If `False`, consider string dictionary keys as case insensitive.

    The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API

    @param is_user_input    Whether the keys are generated from user input.
                            If `True` strings get converted to `int`/`slice` if needed.
    @param traverse_string  Whether to traverse into objects as strings.
                            If `True`, any non-compatible object will first be
                            converted into a string and then traversed into.

    @returns                The result of the object traversal.
                            If successful, `get_all=True`, and the path branches at least once,
                            then a list of results is returned instead.
                            A list is always returned if the last path branches and no `default` is given.
    """
    # `str`/`bytes` are sequences too, but are only traversed when `traverse_string` is set
    is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
    casefold = lambda k: k.casefold() if isinstance(k, str) else k

    # `type_test` maps every raw result to the value to keep; a result of None is dropped later
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))

    def apply_key(key, obj):
        # Generator yielding the value(s) that a single `key` selects from `obj`
        if obj is None:
            return

        elif key is None:
            yield obj

        elif isinstance(key, (list, tuple)):
            # Alternatives: every branch is applied to the same object
            for branch in key:
                _, result = apply_path(obj, branch)
                yield from result

        elif key is ...:
            if isinstance(obj, collections.abc.Mapping):
                yield from obj.values()
            elif is_sequence(obj):
                yield from obj
            elif isinstance(obj, re.Match):
                yield from obj.groups()
            elif traverse_string:
                yield from str(obj)

        elif callable(key):
            if is_sequence(obj):
                iter_obj = enumerate(obj)
            elif isinstance(obj, collections.abc.Mapping):
                iter_obj = obj.items()
            elif isinstance(obj, re.Match):
                # Index 0 is the whole match, followed by the groups
                iter_obj = enumerate((obj.group(), *obj.groups()))
            elif traverse_string:
                iter_obj = enumerate(str(obj))
            else:
                return
            yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))

        elif isinstance(key, dict):
            iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
            # Entries without a result are left out, unless a `default` was given to fill them
            yield {k: v if v is not None else default for k, v in iter_obj
                   if v is not None or default is not NO_DEFAULT}

        elif isinstance(obj, collections.abc.Mapping):
            # When case insensitive, `key` was already casefolded by `apply_path`
            yield (obj.get(key) if casesense or (key in obj)
                   else next((v for k, v in obj.items() if casefold(k) == key), None))

        elif isinstance(obj, re.Match):
            if isinstance(key, int) or casesense:
                with contextlib.suppress(IndexError):
                    yield obj.group(key)
                return

            if not isinstance(key, str):
                return

            yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)

        else:
            if is_user_input:
                # e.g. '3' becomes 3 and '1:5' becomes slice(1, 5)
                key = (int_or_none(key) if ':' not in key
                       else slice(*map(int_or_none, key.split(':'))))

            if not isinstance(key, (int, slice)):
                return

            if not is_sequence(obj):
                if not traverse_string:
                    return
                obj = str(obj)

            with contextlib.suppress(IndexError):
                yield obj[key]

    def apply_path(start_obj, path):
        # Lazily apply every key of `path` in turn to all objects selected so far
        objs = (start_obj,)
        has_branched = False

        for key in variadic(path):
            if is_user_input and key == ':':
                key = ...

            if not casesense and isinstance(key, str):
                key = key.casefold()

            if key is ... or isinstance(key, (list, tuple)) or callable(key):
                has_branched = True

            key_func = functools.partial(apply_key, key)
            objs = itertools.chain.from_iterable(map(key_func, objs))

        return has_branched, objs

    def _traverse_obj(obj, path, use_list=True):
        has_branched, results = apply_path(obj, path)
        results = LazyList(x for x in map(type_test, results) if x is not None)

        if get_all and has_branched:
            return results.exhaust() if results or use_list else None

        return results[0] if results else None

    for index, path in enumerate(paths, 1):
        # Only the last path may produce an empty list, and only if there is no `default` to fall back to
        use_list = default is NO_DEFAULT and index == len(paths)
        result = _traverse_obj(obj, path, use_list)
        if result is not None:
            return result

    return None if default is NO_DEFAULT else default
  4590. def traverse_dict(dictn, keys, casesense=True):
  4591. deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
  4592. f'in a future version. Use "{__name__}.traverse_obj" instead')
  4593. return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
  4594. def get_first(obj, keys, **kwargs):
  4595. return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
  4596. def time_seconds(**kwargs):
  4597. t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
  4598. return t.timestamp()
  4599. # create a JSON Web Signature (jws) with HS256 algorithm
  4600. # the resulting format is in JWS Compact Serialization
  4601. # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
  4602. # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
  4603. def jwt_encode_hs256(payload_data, key, headers={}):
  4604. header_data = {
  4605. 'alg': 'HS256',
  4606. 'typ': 'JWT',
  4607. }
  4608. if headers:
  4609. header_data.update(headers)
  4610. header_b64 = base64.b64encode(json.dumps(header_data).encode())
  4611. payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
  4612. h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
  4613. signature_b64 = base64.b64encode(h.digest())
  4614. token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
  4615. return token
  4616. # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
  4617. def jwt_decode_hs256(jwt):
  4618. header_b64, payload_b64, signature_b64 = jwt.split('.')
  4619. # add trailing ='s that may have been stripped, superfluous ='s are ignored
  4620. payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
  4621. return payload_data
  4622. WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
  4623. @functools.cache
  4624. def supports_terminal_sequences(stream):
  4625. if compat_os_name == 'nt':
  4626. if not WINDOWS_VT_MODE:
  4627. return False
  4628. elif not os.getenv('TERM'):
  4629. return False
  4630. try:
  4631. return stream.isatty()
  4632. except BaseException:
  4633. return False
  4634. def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
  4635. if get_windows_version() < (10, 0, 10586):
  4636. return
  4637. global WINDOWS_VT_MODE
  4638. try:
  4639. Popen.run('', shell=True)
  4640. except Exception:
  4641. return
  4642. WINDOWS_VT_MODE = True
  4643. supports_terminal_sequences.cache_clear()
  4644. _terminal_sequences_re = re.compile('\033\\[[^m]+m')
  4645. def remove_terminal_sequences(string):
  4646. return _terminal_sequences_re.sub('', string)
  4647. def number_of_digits(number):
  4648. return len('%d' % number)
  4649. def join_nonempty(*values, delim='-', from_dict=None):
  4650. if from_dict is not None:
  4651. values = (traverse_obj(from_dict, variadic(v)) for v in values)
  4652. return delim.join(map(str, filter(None, values)))
  4653. def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
  4654. """
  4655. Find the largest format dimensions in terms of video width and, for each thumbnail:
  4656. * Modify the URL: Match the width with the provided regex and replace with the former width
  4657. * Update dimensions
  4658. This function is useful with video services that scale the provided thumbnails on demand
  4659. """
  4660. _keys = ('width', 'height')
  4661. max_dimensions = max(
  4662. (tuple(format.get(k) or 0 for k in _keys) for format in formats),
  4663. default=(0, 0))
  4664. if not max_dimensions[0]:
  4665. return thumbnails
  4666. return [
  4667. merge_dicts(
  4668. {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
  4669. dict(zip(_keys, max_dimensions)), thumbnail)
  4670. for thumbnail in thumbnails
  4671. ]
  4672. def parse_http_range(range):
  4673. """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
  4674. if not range:
  4675. return None, None, None
  4676. crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
  4677. if not crg:
  4678. return None, None, None
  4679. return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
  4680. def read_stdin(what):
  4681. eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
  4682. write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
  4683. return sys.stdin
  4684. def determine_file_encoding(data):
  4685. """
  4686. Detect the text encoding used
  4687. @returns (encoding, bytes to skip)
  4688. """
  4689. # BOM marks are given priority over declarations
  4690. for bom, enc in BOMS:
  4691. if data.startswith(bom):
  4692. return enc, len(bom)
  4693. # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
  4694. # We ignore the endianness to get a good enough match
  4695. data = data.replace(b'\0', b'')
  4696. mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
  4697. return mobj.group(1).decode() if mobj else None, 0
  4698. class Config:
  4699. own_args = None
  4700. parsed_args = None
  4701. filename = None
  4702. __initialized = False
  4703. def __init__(self, parser, label=None):
  4704. self.parser, self.label = parser, label
  4705. self._loaded_paths, self.configs = set(), []
  4706. def init(self, args=None, filename=None):
  4707. assert not self.__initialized
  4708. self.own_args, self.filename = args, filename
  4709. return self.load_configs()
  4710. def load_configs(self):
  4711. directory = ''
  4712. if self.filename:
  4713. location = os.path.realpath(self.filename)
  4714. directory = os.path.dirname(location)
  4715. if location in self._loaded_paths:
  4716. return False
  4717. self._loaded_paths.add(location)
  4718. self.__initialized = True
  4719. opts, _ = self.parser.parse_known_args(self.own_args)
  4720. self.parsed_args = self.own_args
  4721. for location in opts.config_locations or []:
  4722. if location == '-':
  4723. if location in self._loaded_paths:
  4724. continue
  4725. self._loaded_paths.add(location)
  4726. self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
  4727. continue
  4728. location = os.path.join(directory, expand_path(location))
  4729. if os.path.isdir(location):
  4730. location = os.path.join(location, 'hypervideo.conf')
  4731. if not os.path.exists(location):
  4732. self.parser.error(f'config location {location} does not exist')
  4733. self.append_config(self.read_file(location), location)
  4734. return True
  4735. def __str__(self):
  4736. label = join_nonempty(
  4737. self.label, 'config', f'"{self.filename}"' if self.filename else '',
  4738. delim=' ')
  4739. return join_nonempty(
  4740. self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
  4741. *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
  4742. delim='\n')
  4743. @staticmethod
  4744. def read_file(filename, default=[]):
  4745. try:
  4746. optionf = open(filename, 'rb')
  4747. except OSError:
  4748. return default # silently skip if file is not present
  4749. try:
  4750. enc, skip = determine_file_encoding(optionf.read(512))
  4751. optionf.seek(skip, io.SEEK_SET)
  4752. except OSError:
  4753. enc = None # silently skip read errors
  4754. try:
  4755. # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
  4756. contents = optionf.read().decode(enc or preferredencoding())
  4757. res = shlex.split(contents, comments=True)
  4758. except Exception as err:
  4759. raise ValueError(f'Unable to parse "{filename}": {err}')
  4760. finally:
  4761. optionf.close()
  4762. return res
  4763. @staticmethod
  4764. def hide_login_info(opts):
  4765. PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
  4766. eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
  4767. def _scrub_eq(o):
  4768. m = eqre.match(o)
  4769. if m:
  4770. return m.group('key') + '=PRIVATE'
  4771. else:
  4772. return o
  4773. opts = list(map(_scrub_eq, opts))
  4774. for idx, opt in enumerate(opts):
  4775. if opt in PRIVATE_OPTS and idx + 1 < len(opts):
  4776. opts[idx + 1] = 'PRIVATE'
  4777. return opts
  4778. def append_config(self, *args, label=None):
  4779. config = type(self)(self.parser, label)
  4780. config._loaded_paths = self._loaded_paths
  4781. if config.init(*args):
  4782. self.configs.append(config)
  4783. @property
  4784. def all_args(self):
  4785. for config in reversed(self.configs):
  4786. yield from config.all_args
  4787. yield from self.parsed_args or []
  4788. def parse_known_args(self, **kwargs):
  4789. return self.parser.parse_known_args(self.all_args, **kwargs)
  4790. def parse_args(self):
  4791. return self.parser.parse_args(self.all_args)
  4792. class WebSocketsWrapper:
  4793. """Wraps websockets module to use in non-async scopes"""
  4794. pool = None
  4795. def __init__(self, url, headers=None, connect=True):
  4796. self.loop = asyncio.new_event_loop()
  4797. # XXX: "loop" is deprecated
  4798. self.conn = websockets.connect(
  4799. url, extra_headers=headers, ping_interval=None,
  4800. close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
  4801. if connect:
  4802. self.__enter__()
  4803. atexit.register(self.__exit__, None, None, None)
  4804. def __enter__(self):
  4805. if not self.pool:
  4806. self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
  4807. return self
  4808. def send(self, *args):
  4809. self.run_with_loop(self.pool.send(*args), self.loop)
  4810. def recv(self, *args):
  4811. return self.run_with_loop(self.pool.recv(*args), self.loop)
  4812. def __exit__(self, type, value, traceback):
  4813. try:
  4814. return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
  4815. finally:
  4816. self.loop.close()
  4817. self._cancel_all_tasks(self.loop)
  4818. # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
  4819. # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
  4820. @staticmethod
  4821. def run_with_loop(main, loop):
  4822. if not asyncio.iscoroutine(main):
  4823. raise ValueError(f'a coroutine was expected, got {main!r}')
  4824. try:
  4825. return loop.run_until_complete(main)
  4826. finally:
  4827. loop.run_until_complete(loop.shutdown_asyncgens())
  4828. if hasattr(loop, 'shutdown_default_executor'):
  4829. loop.run_until_complete(loop.shutdown_default_executor())
  4830. @staticmethod
  4831. def _cancel_all_tasks(loop):
  4832. to_cancel = asyncio.all_tasks(loop)
  4833. if not to_cancel:
  4834. return
  4835. for task in to_cancel:
  4836. task.cancel()
  4837. # XXX: "loop" is removed in python 3.10+
  4838. loop.run_until_complete(
  4839. asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
  4840. for task in to_cancel:
  4841. if task.cancelled():
  4842. continue
  4843. if task.exception() is not None:
  4844. loop.call_exception_handler({
  4845. 'message': 'unhandled exception during asyncio.run() shutdown',
  4846. 'exception': task.exception(),
  4847. 'task': task,
  4848. })
  4849. def merge_headers(*dicts):
  4850. """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
  4851. return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
  4852. def cached_method(f):
  4853. """Cache a method"""
  4854. signature = inspect.signature(f)
  4855. @functools.wraps(f)
  4856. def wrapper(self, *args, **kwargs):
  4857. bound_args = signature.bind(self, *args, **kwargs)
  4858. bound_args.apply_defaults()
  4859. key = tuple(bound_args.arguments.values())[1:]
  4860. cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
  4861. if key not in cache:
  4862. cache[key] = f(self, *args, **kwargs)
  4863. return cache[key]
  4864. return wrapper
  4865. class classproperty:
  4866. """property access for class methods with optional caching"""
  4867. def __new__(cls, func=None, *args, **kwargs):
  4868. if not func:
  4869. return functools.partial(cls, *args, **kwargs)
  4870. return super().__new__(cls)
  4871. def __init__(self, func, *, cache=False):
  4872. functools.update_wrapper(self, func)
  4873. self.func = func
  4874. self._cache = {} if cache else None
  4875. def __get__(self, _, cls):
  4876. if self._cache is None:
  4877. return self.func(cls)
  4878. elif cls not in self._cache:
  4879. self._cache[cls] = self.func(cls)
  4880. return self._cache[cls]
  4881. class Namespace(types.SimpleNamespace):
  4882. """Immutable namespace"""
  4883. def __iter__(self):
  4884. return iter(self.__dict__.values())
  4885. @property
  4886. def items_(self):
  4887. return self.__dict__.items()
# File extensions grouped by the kind of media they are used for
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)

# `video`/`audio` are the complete lists: the `common_*` extensions are appended to them
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
  4901. class RetryManager:
  4902. """Usage:
  4903. for retry in RetryManager(...):
  4904. try:
  4905. ...
  4906. except SomeException as err:
  4907. retry.error = err
  4908. continue
  4909. """
  4910. attempt, _error = 0, None
  4911. def __init__(self, _retries, _error_callback, **kwargs):
  4912. self.retries = _retries or 0
  4913. self.error_callback = functools.partial(_error_callback, **kwargs)
  4914. def _should_retry(self):
  4915. return self._error is not NO_DEFAULT and self.attempt <= self.retries
  4916. @property
  4917. def error(self):
  4918. if self._error is NO_DEFAULT:
  4919. return None
  4920. return self._error
  4921. @error.setter
  4922. def error(self, value):
  4923. self._error = value
  4924. def __iter__(self):
  4925. while self._should_retry():
  4926. self.error = NO_DEFAULT
  4927. self.attempt += 1
  4928. yield self
  4929. if self.error:
  4930. self.error_callback(self.error, self.attempt, self.retries)
  4931. @staticmethod
  4932. def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
  4933. """Utility function for reporting retries"""
  4934. if count > retries:
  4935. if error:
  4936. return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
  4937. raise e
  4938. if not count:
  4939. return warn(e)
  4940. elif isinstance(e, ExtractorError):
  4941. e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
  4942. warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
  4943. delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
  4944. if delay:
  4945. info(f'Sleeping {delay:.2f} seconds ...')
  4946. time.sleep(delay)
  4947. def make_archive_id(ie, video_id):
  4948. ie_key = ie if isinstance(ie, str) else ie.ie_key()
  4949. return f'{ie_key.lower()} {video_id}'
  4950. def truncate_string(s, left, right=0):
  4951. assert left > 3 and right >= 0
  4952. if s is None or len(s) <= left + right:
  4953. return s
  4954. return f'{s[:left-3]}...{s[-right:]}'
  def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
      """Resolve a list of option names (and aliases) into an ordered set.

      Each option either adds items or, when prefixed with "-", removes them.
      Names found in `alias_dict` are expanded recursively; a negated alias
      flips the sign of every member of its expansion. Any other name must be
      a member of alias_dict['all'] (or, with `use_regex`, is treated as a
      case-insensitive full-match pattern over alias_dict['all']).

      @param options     Iterable of option strings
      @param alias_dict  Mapping of alias name -> list of options; must contain "all"
      @param use_regex   Treat non-alias names as regular expressions
      @param start       Initial items to start from
      @raises ValueError if a non-regex name is neither an alias nor in "all"
      """
      assert 'all' in alias_dict, '"all" alias is required'
      selected = list(start or [])

      for option in options:
          negate = option.startswith('-')
          name = option[1:] if negate else option

          if name in alias_dict:
              expansion = alias_dict[name]
              if negate:
                  # Removing an alias means inverting each of its members
                  expansion = [
                      member[1:] if member.startswith('-') else f'-{member}'
                      for member in expansion]
              # NB: Do not allow regex in aliases for performance
              selected = orderedSet_from_options(expansion, alias_dict, start=selected)
              continue

          if use_regex:
              matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
          elif name in alias_dict['all']:
              matches = [name]
          else:
              raise ValueError(name)

          if not negate:
              selected.extend(matches)
              continue
          for item in matches:
              # Drop every occurrence, not just the first
              while item in selected:
                  selected.remove(item)

      return orderedSet(selected)
  4979. class FormatSorter:
  4980. regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
  4981. default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
  4982. 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
  4983. 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
  4984. ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
  4985. 'height', 'width', 'proto', 'vext', 'abr', 'aext',
  4986. 'fps', 'fs_approx', 'source', 'id')
  4987. settings = {
  4988. 'vcodec': {'type': 'ordered', 'regex': True,
  4989. 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
  4990. 'acodec': {'type': 'ordered', 'regex': True,
  4991. 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
  4992. 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
  4993. 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
  4994. 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
  4995. 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
  4996. 'vext': {'type': 'ordered', 'field': 'video_ext',
  4997. 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
  4998. 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
  4999. 'aext': {'type': 'ordered', 'field': 'audio_ext',
  5000. 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
  5001. 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
  5002. 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
  5003. 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
  5004. 'field': ('vcodec', 'acodec'),
  5005. 'function': lambda it: int(any(v != 'none' for v in it))},
  5006. 'ie_pref': {'priority': True, 'type': 'extractor'},
  5007. 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
  5008. 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
  5009. 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
  5010. 'quality': {'convert': 'float', 'default': -1},
  5011. 'filesize': {'convert': 'bytes'},
  5012. 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
  5013. 'id': {'convert': 'string', 'field': 'format_id'},
  5014. 'height': {'convert': 'float_none'},
  5015. 'width': {'convert': 'float_none'},
  5016. 'fps': {'convert': 'float_none'},
  5017. 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
  5018. 'tbr': {'convert': 'float_none'},
  5019. 'vbr': {'convert': 'float_none'},
  5020. 'abr': {'convert': 'float_none'},
  5021. 'asr': {'convert': 'float_none'},
  5022. 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
  5023. 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
  5024. 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
  5025. 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
  5026. 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
  5027. 'res': {'type': 'multiple', 'field': ('height', 'width'),
  5028. 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
  5029. # Actual field names
  5030. 'format_id': {'type': 'alias', 'field': 'id'},
  5031. 'preference': {'type': 'alias', 'field': 'ie_pref'},
  5032. 'language_preference': {'type': 'alias', 'field': 'lang'},
  5033. 'source_preference': {'type': 'alias', 'field': 'source'},
  5034. 'protocol': {'type': 'alias', 'field': 'proto'},
  5035. 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
  5036. 'audio_channels': {'type': 'alias', 'field': 'channels'},
  5037. # Deprecated
  5038. 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
  5039. 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
  5040. 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
  5041. 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
  5042. 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
  5043. 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
  5044. 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
  5045. 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
  5046. 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
  5047. 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
  5048. 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
  5049. 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
  5050. 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
  5051. 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
  5052. 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  5053. 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  5054. 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  5055. 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  5056. 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  5057. 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  5058. }
  5059. def __init__(self, ydl, field_preference):
  5060. self.ydl = ydl
  5061. self._order = []
  5062. self.evaluate_params(self.ydl.params, field_preference)
  5063. if ydl.params.get('verbose'):
  5064. self.print_verbose_info(self.ydl.write_debug)
  5065. def _get_field_setting(self, field, key):
  5066. if field not in self.settings:
  5067. if key in ('forced', 'priority'):
  5068. return False
  5069. self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
  5070. 'deprecated and may be removed in a future version')
  5071. self.settings[field] = {}
  5072. propObj = self.settings[field]
  5073. if key not in propObj:
  5074. type = propObj.get('type')
  5075. if key == 'field':
  5076. default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
  5077. elif key == 'convert':
  5078. default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
  5079. else:
  5080. default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
  5081. propObj[key] = default
  5082. return propObj[key]
  5083. def _resolve_field_value(self, field, value, convertNone=False):
  5084. if value is None:
  5085. if not convertNone:
  5086. return None
  5087. else:
  5088. value = value.lower()
  5089. conversion = self._get_field_setting(field, 'convert')
  5090. if conversion == 'ignore':
  5091. return None
  5092. if conversion == 'string':
  5093. return value
  5094. elif conversion == 'float_none':
  5095. return float_or_none(value)
  5096. elif conversion == 'bytes':
  5097. return parse_bytes(value)
  5098. elif conversion == 'order':
  5099. order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
  5100. use_regex = self._get_field_setting(field, 'regex')
  5101. list_length = len(order_list)
  5102. empty_pos = order_list.index('') if '' in order_list else list_length + 1
  5103. if use_regex and value is not None:
  5104. for i, regex in enumerate(order_list):
  5105. if regex and re.match(regex, value):
  5106. return list_length - i
  5107. return list_length - empty_pos # not in list
  5108. else: # not regex or value = None
  5109. return list_length - (order_list.index(value) if value in order_list else empty_pos)
  5110. else:
  5111. if value.isnumeric():
  5112. return float(value)
  5113. else:
  5114. self.settings[field]['convert'] = 'string'
  5115. return value
  5116. def evaluate_params(self, params, sort_extractor):
  5117. self._use_free_order = params.get('prefer_free_formats', False)
  5118. self._sort_user = params.get('format_sort', [])
  5119. self._sort_extractor = sort_extractor
  5120. def add_item(field, reverse, closest, limit_text):
  5121. field = field.lower()
  5122. if field in self._order:
  5123. return
  5124. self._order.append(field)
  5125. limit = self._resolve_field_value(field, limit_text)
  5126. data = {
  5127. 'reverse': reverse,
  5128. 'closest': False if limit is None else closest,
  5129. 'limit_text': limit_text,
  5130. 'limit': limit}
  5131. if field in self.settings:
  5132. self.settings[field].update(data)
  5133. else:
  5134. self.settings[field] = data
  5135. sort_list = (
  5136. tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
  5137. + (tuple() if params.get('format_sort_force', False)
  5138. else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
  5139. + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
  5140. for item in sort_list:
  5141. match = re.match(self.regex, item)
  5142. if match is None:
  5143. raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
  5144. field = match.group('field')
  5145. if field is None:
  5146. continue
  5147. if self._get_field_setting(field, 'type') == 'alias':
  5148. alias, field = field, self._get_field_setting(field, 'field')
  5149. if self._get_field_setting(alias, 'deprecated'):
  5150. self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
  5151. f'be removed in a future version. Please use {field} instead')
  5152. reverse = match.group('reverse') is not None
  5153. closest = match.group('separator') == '~'
  5154. limit_text = match.group('limit')
  5155. has_limit = limit_text is not None
  5156. has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
  5157. has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
  5158. fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
  5159. limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
  5160. limit_count = len(limits)
  5161. for (i, f) in enumerate(fields):
  5162. add_item(f, reverse, closest,
  5163. limits[i] if i < limit_count
  5164. else limits[0] if has_limit and not has_multiple_limits
  5165. else None)
  5166. def print_verbose_info(self, write_debug):
  5167. if self._sort_user:
  5168. write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
  5169. if self._sort_extractor:
  5170. write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
  5171. write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
  5172. '+' if self._get_field_setting(field, 'reverse') else '', field,
  5173. '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
  5174. self._get_field_setting(field, 'limit_text'),
  5175. self._get_field_setting(field, 'limit'))
  5176. if self._get_field_setting(field, 'limit_text') is not None else '')
  5177. for field in self._order if self._get_field_setting(field, 'visible')]))
  5178. def _calculate_field_preference_from_value(self, format, field, type, value):
  5179. reverse = self._get_field_setting(field, 'reverse')
  5180. closest = self._get_field_setting(field, 'closest')
  5181. limit = self._get_field_setting(field, 'limit')
  5182. if type == 'extractor':
  5183. maximum = self._get_field_setting(field, 'max')
  5184. if value is None or (maximum is not None and value >= maximum):
  5185. value = -1
  5186. elif type == 'boolean':
  5187. in_list = self._get_field_setting(field, 'in_list')
  5188. not_in_list = self._get_field_setting(field, 'not_in_list')
  5189. value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
  5190. elif type == 'ordered':
  5191. value = self._resolve_field_value(field, value, True)
  5192. # try to convert to number
  5193. val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
  5194. is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
  5195. if is_num:
  5196. value = val_num
  5197. return ((-10, 0) if value is None
  5198. else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
  5199. else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
  5200. else (0, value, 0) if not reverse and (limit is None or value <= limit)
  5201. else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
  5202. else (-1, value, 0))
  5203. def _calculate_field_preference(self, format, field):
  5204. type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
  5205. get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
  5206. if type == 'multiple':
  5207. type = 'field' # Only 'field' is allowed in multiple for now
  5208. actual_fields = self._get_field_setting(field, 'field')
  5209. value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
  5210. else:
  5211. value = get_value(field)
  5212. return self._calculate_field_preference_from_value(format, field, type, value)
  5213. def calculate_preference(self, format):
  5214. # Determine missing protocol
  5215. if not format.get('protocol'):
  5216. format['protocol'] = determine_protocol(format)
  5217. # Determine missing ext
  5218. if not format.get('ext') and 'url' in format:
  5219. format['ext'] = determine_ext(format['url'])
  5220. if format.get('vcodec') == 'none':
  5221. format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
  5222. format['video_ext'] = 'none'
  5223. else:
  5224. format['video_ext'] = format['ext']
  5225. format['audio_ext'] = 'none'
  5226. # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
  5227. # format['preference'] = -1000
  5228. # Determine missing bitrates
  5229. if format.get('tbr') is None:
  5230. if format.get('vbr') is not None and format.get('abr') is not None:
  5231. format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
  5232. else:
  5233. if format.get('vcodec') != 'none' and format.get('vbr') is None:
  5234. format['vbr'] = format.get('tbr') - format.get('abr', 0)
  5235. if format.get('acodec') != 'none' and format.get('abr') is None:
  5236. format['abr'] = format.get('tbr') - format.get('vbr', 0)
  5237. return tuple(self._calculate_field_preference(format, field) for field in self._order)
  # Deprecated
  # Boolean flags holding the truthiness of the `certifi` and `websockets`
  # names, which are bound outside this section of the file
  has_certifi = bool(certifi)
  has_websockets = bool(websockets)