internetarchive.go 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298
  1. // Package internetarchive provides an interface to Internet Archive's Item
  2. // via their native API than using S3-compatible endpoints.
  3. package internetarchive
  4. import (
  5. "bytes"
  6. "context"
  7. "encoding/json"
  8. "errors"
  9. "fmt"
  10. "io"
  11. "net/http"
  12. "net/url"
  13. "path"
  14. "regexp"
  15. "strconv"
  16. "strings"
  17. "time"
  18. "github.com/ncw/swift/v2"
  19. "github.com/rclone/rclone/fs"
  20. "github.com/rclone/rclone/fs/config"
  21. "github.com/rclone/rclone/fs/config/configmap"
  22. "github.com/rclone/rclone/fs/config/configstruct"
  23. "github.com/rclone/rclone/fs/fserrors"
  24. "github.com/rclone/rclone/fs/fshttp"
  25. "github.com/rclone/rclone/fs/hash"
  26. "github.com/rclone/rclone/lib/bucket"
  27. "github.com/rclone/rclone/lib/encoder"
  28. "github.com/rclone/rclone/lib/pacer"
  29. "github.com/rclone/rclone/lib/random"
  30. "github.com/rclone/rclone/lib/rest"
  31. )
// Register with Fs
//
// Declares the backend's name, constructor, user-visible options and the
// metadata keys it can read/write so rclone's registry can expose them.
func init() {
	fs.Register(&fs.RegInfo{
		Name:        "internetarchive",
		Description: "Internet Archive",
		NewFs:       NewFs,
		MetadataInfo: &fs.MetadataInfo{
			// System keys: most are computed server-side and therefore read-only;
			// the rclone-* keys are written by this backend itself.
			System: map[string]fs.MetadataHelp{
				"name": {
					Help:     "Full file path, without the bucket part",
					Type:     "filename",
					Example:  "backend/internetarchive/internetarchive.go",
					ReadOnly: true,
				},
				"source": {
					Help:     "The source of the file",
					Type:     "string",
					Example:  "original",
					ReadOnly: true,
				},
				"mtime": {
					Help:     "Time of last modification, managed by Rclone",
					Type:     "RFC 3339",
					Example:  "2006-01-02T15:04:05.999999999Z",
					ReadOnly: true,
				},
				"size": {
					Help:     "File size in bytes",
					Type:     "decimal number",
					Example:  "123456",
					ReadOnly: true,
				},
				"md5": {
					Help:     "MD5 hash calculated by Internet Archive",
					Type:     "string",
					Example:  "01234567012345670123456701234567",
					ReadOnly: true,
				},
				"crc32": {
					Help:     "CRC32 calculated by Internet Archive",
					Type:     "string",
					Example:  "01234567",
					ReadOnly: true,
				},
				"sha1": {
					Help:     "SHA1 hash calculated by Internet Archive",
					Type:     "string",
					Example:  "0123456701234567012345670123456701234567",
					ReadOnly: true,
				},
				"format": {
					Help:     "Name of format identified by Internet Archive",
					Type:     "string",
					Example:  "Comma-Separated Values",
					ReadOnly: true,
				},
				"old_version": {
					Help:     "Whether the file was replaced and moved by keep-old-version flag",
					Type:     "boolean",
					Example:  "true",
					ReadOnly: true,
				},
				"viruscheck": {
					Help:     "The last time viruscheck process was run for the file (?)",
					Type:     "unixtime",
					Example:  "1654191352",
					ReadOnly: true,
				},
				"summation": {
					Help:     "Check https://forum.rclone.org/t/31922 for how it is used",
					Type:     "string",
					Example:  "md5",
					ReadOnly: true,
				},
				"rclone-ia-mtime": {
					Help:    "Time of last modification, managed by Internet Archive",
					Type:    "RFC 3339",
					Example: "2006-01-02T15:04:05.999999999Z",
				},
				"rclone-mtime": {
					Help:    "Time of last modification, managed by Rclone",
					Type:    "RFC 3339",
					Example: "2006-01-02T15:04:05.999999999Z",
				},
				"rclone-update-track": {
					Help:    "Random value used by Rclone for tracking changes inside Internet Archive",
					Type:    "string",
					Example: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
				},
			},
			Help: `Metadata fields provided by Internet Archive.
If there are multiple values for a key, only the first one is returned.
This is a limitation of Rclone, that supports one value per one key.
Owner is able to add custom keys. Metadata feature grabs all the keys including them.
`,
		},
		Options: []fs.Option{{
			Name:      "access_key_id",
			Help:      "IAS3 Access Key.\n\nLeave blank for anonymous access.\nYou can find one here: https://archive.org/account/s3.php",
			Sensitive: true,
		}, {
			Name:      "secret_access_key",
			Help:      "IAS3 Secret Key (password).\n\nLeave blank for anonymous access.",
			Sensitive: true,
		}, {
			// their official client (https://github.com/jjjake/internetarchive) hardcodes following the two
			Name:     "endpoint",
			Help:     "IAS3 Endpoint.\n\nLeave blank for default value.",
			Default:  "https://s3.us.archive.org",
			Advanced: true,
		}, {
			Name:     "front_endpoint",
			Help:     "Host of InternetArchive Frontend.\n\nLeave blank for default value.",
			Default:  "https://archive.org",
			Advanced: true,
		}, {
			Name: "disable_checksum",
			Help: `Don't ask the server to test against MD5 checksum calculated by rclone.
Normally rclone will calculate the MD5 checksum of the input before
uploading it so it can ask the server to check the object against checksum.
This is great for data integrity checking but can cause long delays for
large files to start uploading.`,
			Default:  true,
			Advanced: true,
		}, {
			Name: "wait_archive",
			Help: `Timeout for waiting the server's processing tasks (specifically archive and book_op) to finish.
Only enable if you need to be guaranteed to be reflected after write operations.
0 to disable waiting. No errors to be thrown in case of timeout.`,
			Default:  fs.Duration(0),
			Advanced: true,
		}, {
			Name:     config.ConfigEncoding,
			Help:     config.ConfigEncodingHelp,
			Advanced: true,
			Default: encoder.EncodeZero |
				encoder.EncodeSlash |
				encoder.EncodeLtGt |
				encoder.EncodeCrLf |
				encoder.EncodeDel |
				encoder.EncodeCtl |
				encoder.EncodeInvalidUtf8 |
				encoder.EncodeDot,
		},
		}})
}
// maximum size of an item. this is constant across all items (1 TiB)
const iaItemMaxSize int64 = 1099511627776

// metadata keys that are not writeable; writes requesting these keys are
// skipped with a warning (see Update)
var roMetadataKey = map[string]interface{}{
	// do not add mtime here, it's a documented exception
	"name": nil, "source": nil, "size": nil, "md5": nil,
	"crc32": nil, "sha1": nil, "format": nil, "old_version": nil,
	"viruscheck": nil, "summation": nil,
}
// Options defines the configuration for this backend
type Options struct {
	AccessKeyID     string               `config:"access_key_id"`     // IAS3 access key; empty for anonymous
	SecretAccessKey string               `config:"secret_access_key"` // IAS3 secret key; empty for anonymous
	Endpoint        string               `config:"endpoint"`          // IAS3 (S3-like) endpoint URL
	FrontEndpoint   string               `config:"front_endpoint"`    // frontend (archive.org) endpoint URL
	DisableChecksum bool                 `config:"disable_checksum"`  // skip sending Content-MD5 on upload
	WaitArchive     fs.Duration          `config:"wait_archive"`      // how long to wait for server-side processing; 0 = don't wait
	Enc             encoder.MultiEncoder `config:"encoding"`          // filename encoding rules
}
// Fs represents an IAS3 remote
type Fs struct {
	name     string       // name of this remote
	root     string       // the path we are working on if any
	opt      Options      // parsed config options
	features *fs.Features // optional features
	srv      *rest.Client // the connection to IAS3
	front    *rest.Client // the connection to frontend
	pacer    *fs.Pacer    // pacer for API calls
	ctx      context.Context
}
// Object describes a file at IA
type Object struct {
	fs      *Fs       // reference to Fs
	remote  string    // the remote path
	modTime time.Time // last modified time
	size    int64     // size of the file in bytes
	md5     string    // md5 hash of the file presented by the server
	sha1    string    // sha1 hash of the file presented by the server
	crc32   string    // crc32 of the file presented by the server
	rawData json.RawMessage // raw per-file JSON from the metadata endpoint, decoded lazily
}
// IAFile represents a subset of object in MetadataResponse.Files
//
// Size and hashes arrive as strings and are parsed where needed.
type IAFile struct {
	Name string `json:"name"`
	// Source string `json:"source"`
	Mtime       string          `json:"mtime"`               // server-managed mtime (unix time as string)
	RcloneMtime json.RawMessage `json:"rclone-mtime"`        // rclone-written mtime; raw because it may be string or array
	UpdateTrack json.RawMessage `json:"rclone-update-track"` // random token rclone writes to detect its own updates
	Size        string          `json:"size"`
	Md5         string          `json:"md5"`
	Crc32       string          `json:"crc32"`
	Sha1        string          `json:"sha1"`
	Summation   string          `json:"summation"`
	rawData     json.RawMessage // full raw JSON of this entry
}
// MetadataResponse represents subset of the JSON object returned by (frontend)/metadata/
type MetadataResponse struct {
	Files    []IAFile `json:"files"`
	ItemSize int64    `json:"item_size"` // total bytes used by the item
}

// MetadataResponseRaw is the form of MetadataResponse to deal with metadata;
// files are kept as raw JSON so arbitrary (user-defined) keys survive decoding
type MetadataResponseRaw struct {
	Files    []json.RawMessage `json:"files"`
	ItemSize int64             `json:"item_size"`
}
// ModMetadataResponse represents response for amending metadata
type ModMetadataResponse struct {
	// https://archive.org/services/docs/api/md-write.html#example
	Success bool   `json:"success"`
	Error   string `json:"error"` // human-readable reason when Success is false
}
// Name of the remote (as passed into NewFs)
func (f *Fs) Name() string {
	return f.name
}
// Root of the remote (as passed into NewFs)
func (f *Fs) Root() string {
	return f.root
}
  257. // String converts this Fs to a string
  258. func (f *Fs) String() string {
  259. bucket, file := f.split("")
  260. if bucket == "" {
  261. return "Internet Archive root"
  262. }
  263. if file == "" {
  264. return fmt.Sprintf("Internet Archive item %s", bucket)
  265. }
  266. return fmt.Sprintf("Internet Archive item %s path %s", bucket, file)
  267. }
// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
	return f.features
}
// Hashes returns type of hashes supported by IA: MD5, SHA1 and CRC32
func (f *Fs) Hashes() hash.Set {
	return hash.NewHashSet(hash.MD5, hash.SHA1, hash.CRC32)
}
  276. // Precision returns the precision of mtime that the server responds
  277. func (f *Fs) Precision() time.Duration {
  278. if f.opt.WaitArchive == 0 {
  279. return fs.ModTimeNotSupported
  280. }
  281. return time.Nanosecond
  282. }
// retryErrorCodes is a slice of error codes that we will retry
// See: https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html
var retryErrorCodes = []int{
	429, // Too Many Requests
	500, // Internal Server Error - "We encountered an internal error. Please try again."
	503, // Service Unavailable/Slow Down - "Reduce your request rate"
}
// NewFs constructs an Fs from the path
//
// Sets up two REST clients (IAS3 for writes/deletes, frontend for
// metadata/downloads), shares the LOW auth header between them when
// credentials are configured, and probes whether root points at a file
// (returning fs.ErrorIsFile with the root adjusted to its parent).
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
	// Parse config into Options struct
	opt := new(Options)
	err := configstruct.Set(m, opt)
	if err != nil {
		return nil, err
	}
	// Parse the endpoints
	ep, err := url.Parse(opt.Endpoint)
	if err != nil {
		return nil, err
	}
	fe, err := url.Parse(opt.FrontEndpoint)
	if err != nil {
		return nil, err
	}
	root = strings.Trim(root, "/")
	f := &Fs{
		name: name,
		opt:  *opt,
		ctx:  ctx,
	}
	// root must be set before Features.Fill inspects the Fs
	f.setRoot(root)
	f.features = (&fs.Features{
		BucketBased:   true,
		ReadMetadata:  true,
		WriteMetadata: true,
		UserMetadata:  true,
	}).Fill(ctx, f)
	f.srv = rest.NewClient(fshttp.NewClient(ctx))
	f.srv.SetRoot(ep.String())
	f.front = rest.NewClient(fshttp.NewClient(ctx))
	f.front.SetRoot(fe.String())
	if opt.AccessKeyID != "" && opt.SecretAccessKey != "" {
		// IAS3 "LOW" authorization scheme; same header works on the frontend
		auth := fmt.Sprintf("LOW %s:%s", opt.AccessKeyID, opt.SecretAccessKey)
		f.srv.SetHeader("Authorization", auth)
		f.front.SetHeader("Authorization", auth)
	}
	f.pacer = fs.NewPacer(ctx, pacer.NewS3(pacer.MinSleep(10*time.Millisecond)))
	// test if the root exists as a file
	_, err = f.NewObject(ctx, "/")
	if err == nil {
		// it is a file: point the Fs at the parent directory per rclone convention
		f.setRoot(betterPathDir(root))
		return f, fs.ErrorIsFile
	}
	return f, nil
}
// setRoot changes the root of the Fs, stripping surrounding slashes
func (f *Fs) setRoot(root string) {
	f.root = strings.Trim(root, "/")
}
// Remote returns the remote path
func (o *Object) Remote() string {
	return o.remote
}
// ModTime is the last modified time (read-only)
func (o *Object) ModTime(ctx context.Context) time.Time {
	return o.modTime
}
// Size is the file length
func (o *Object) Size() int64 {
	return o.size
}
// Fs returns the parent Fs
func (o *Object) Fs() fs.Info {
	return o.fs
}
  358. // Hash returns the hash value presented by IA
  359. func (o *Object) Hash(ctx context.Context, ty hash.Type) (string, error) {
  360. if ty == hash.MD5 {
  361. return o.md5, nil
  362. }
  363. if ty == hash.SHA1 {
  364. return o.sha1, nil
  365. }
  366. if ty == hash.CRC32 {
  367. return o.crc32, nil
  368. }
  369. return "", hash.ErrUnsupported
  370. }
// Storable returns if this object is storable (always true here)
func (o *Object) Storable() bool {
	return true
}
// SetModTime sets modTime on a particular file
//
// Implemented via the frontend metadata-write API: the rclone-mtime key
// is first removed (clearing any accumulated values) and then re-added
// with the new time. Returns fs.ErrorCantSetModTime when the object is
// not inside an item, or the server's error text when the patch fails.
func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) {
	bucket, reqDir := o.split()
	if bucket == "" {
		return fs.ErrorCantSetModTime
	}
	if reqDir == "" {
		return fs.ErrorCantSetModTime
	}
	// https://archive.org/services/docs/api/md-write.html
	// the following code might be useful for modifying metadata of an uploaded file
	patch := []map[string]string{
		// we should drop it first to clear all rclone-provided mtimes
		{
			"op":   "remove",
			"path": "/rclone-mtime",
		}, {
			"op":    "add",
			"path":  "/rclone-mtime",
			"value": t.Format(time.RFC3339Nano),
		}}
	res, err := json.Marshal(patch)
	if err != nil {
		return err
	}
	// form-encoded body: -target selects the per-file metadata document
	params := url.Values{}
	params.Add("-target", fmt.Sprintf("files/%s", reqDir))
	params.Add("-patch", string(res))
	body := []byte(params.Encode())
	bodyLen := int64(len(body))
	var resp *http.Response
	var result ModMetadataResponse
	// make a POST request to (frontend)/metadata/:item/
	opts := rest.Opts{
		Method:        "POST",
		Path:          path.Join("/metadata/", bucket),
		Body:          bytes.NewReader(body),
		ContentLength: &bodyLen,
		ContentType:   "application/x-www-form-urlencoded",
	}
	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.front.CallJSON(ctx, &opts, nil, &result)
		return o.fs.shouldRetry(resp, err)
	})
	if err != nil {
		return err
	}
	if result.Success {
		// only update the cached mtime once the server confirmed the patch
		o.modTime = t
		return nil
	}
	return errors.New(result.Error)
}
  428. // List files and directories in a directory
  429. func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
  430. bucket, reqDir := f.split(dir)
  431. if bucket == "" {
  432. if reqDir != "" {
  433. return nil, fs.ErrorListBucketRequired
  434. }
  435. return entries, nil
  436. }
  437. grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, reqDir), "/") + "/")
  438. allEntries, err := f.listAllUnconstrained(ctx, bucket)
  439. if err != nil {
  440. return entries, err
  441. }
  442. for _, ent := range allEntries {
  443. obj, ok := ent.(*Object)
  444. if ok && strings.HasPrefix(obj.remote, grandparent) {
  445. path := trimPathPrefix(obj.remote, grandparent, f.opt.Enc)
  446. if !strings.Contains(path, "/") {
  447. obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc)
  448. entries = append(entries, obj)
  449. }
  450. }
  451. dire, ok := ent.(*fs.Dir)
  452. if ok && strings.HasPrefix(dire.Remote(), grandparent) {
  453. path := trimPathPrefix(dire.Remote(), grandparent, f.opt.Enc)
  454. if !strings.Contains(path, "/") {
  455. dire.SetRemote(trimPathPrefix(dire.Remote(), f.root, f.opt.Enc))
  456. entries = append(entries, dire)
  457. }
  458. }
  459. }
  460. return entries, nil
  461. }
// Mkdir can't be performed on IA like git repositories
// (directories spring into existence with their first file), so it's a no-op
func (f *Fs) Mkdir(ctx context.Context, dir string) (err error) {
	return nil
}
// Rmdir as well, unless we're asked for recursive deletion
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
	return nil
}
  470. // NewObject finds the Object at remote. If it can't be found
  471. // it returns the error fs.ErrorObjectNotFound.
  472. func (f *Fs) NewObject(ctx context.Context, remote string) (ret fs.Object, err error) {
  473. bucket, filepath := f.split(remote)
  474. filepath = strings.Trim(filepath, "/")
  475. if bucket == "" {
  476. if filepath != "" {
  477. return nil, fs.ErrorListBucketRequired
  478. }
  479. return nil, fs.ErrorIsDir
  480. }
  481. grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, filepath), "/"))
  482. allEntries, err := f.listAllUnconstrained(ctx, bucket)
  483. if err != nil {
  484. return nil, err
  485. }
  486. for _, ent := range allEntries {
  487. obj, ok := ent.(*Object)
  488. if ok && obj.remote == grandparent {
  489. obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc)
  490. return obj, nil
  491. }
  492. }
  493. return nil, fs.ErrorObjectNotFound
  494. }
  495. // Put uploads a file
  496. func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
  497. o := &Object{
  498. fs: f,
  499. remote: src.Remote(),
  500. modTime: src.ModTime(ctx),
  501. size: src.Size(),
  502. }
  503. err := o.Update(ctx, in, src, options...)
  504. if err == nil {
  505. return o, nil
  506. }
  507. return nil, err
  508. }
  509. // PublicLink generates a public link to the remote path (usually readable by anyone)
  510. func (f *Fs) PublicLink(ctx context.Context, remote string, expire fs.Duration, unlink bool) (link string, err error) {
  511. if strings.HasSuffix(remote, "/") {
  512. return "", fs.ErrorCantShareDirectories
  513. }
  514. if _, err := f.NewObject(ctx, remote); err != nil {
  515. return "", err
  516. }
  517. bucket, bucketPath := f.split(remote)
  518. return path.Join(f.opt.FrontEndpoint, "/download/", bucket, quotePath(bucketPath)), nil
  519. }
// Copy src to this remote using server-side copy operations.
//
// This is stored with the remote path given.
//
// It returns the destination Object and a possible error.
//
// Will only be called if src.Fs().Name() == f.Name()
//
// If it isn't possible then return fs.ErrorCantCopy
func (f *Fs) Copy(ctx context.Context, src fs.Object, remote string) (_ fs.Object, err error) {
	dstBucket, dstPath := f.split(remote)
	srcObj, ok := src.(*Object)
	if !ok {
		fs.Debugf(src, "Can't copy - not same remote type")
		return nil, fs.ErrorCantCopy
	}
	srcBucket, srcPath := srcObj.split()
	if dstBucket == srcBucket && dstPath == srcPath {
		// https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/cli/ia_copy.py#L68
		fs.Debugf(src, "Can't copy - the source and destination files cannot be the same!")
		return nil, fs.ErrorCantCopy
	}
	// random token so waitFileUpload can recognize this exact operation
	updateTracker := random.String(32)
	// IAS3 copies are expressed as a PUT with x-amz-copy-source;
	// the filemeta headers carry the source's hashes/mtime to the new file
	headers := map[string]string{
		"x-archive-auto-make-bucket":             "1",
		"x-archive-queue-derive":                 "0",
		"x-archive-keep-old-version":             "0",
		"x-amz-copy-source":                      quotePath(path.Join("/", srcBucket, srcPath)),
		"x-amz-metadata-directive":               "COPY",
		"x-archive-filemeta-sha1":                srcObj.sha1,
		"x-archive-filemeta-md5":                 srcObj.md5,
		"x-archive-filemeta-crc32":               srcObj.crc32,
		"x-archive-filemeta-size":                fmt.Sprint(srcObj.size),
		// add this too for sure
		"x-archive-filemeta-rclone-mtime":        srcObj.modTime.Format(time.RFC3339Nano),
		"x-archive-filemeta-rclone-update-track": updateTracker,
	}
	// make a PUT request at (IAS3)/:item/:path without body
	var resp *http.Response
	opts := rest.Opts{
		Method:       "PUT",
		Path:         "/" + url.PathEscape(path.Join(dstBucket, dstPath)),
		ExtraHeaders: headers,
	}
	err = f.pacer.Call(func() (bool, error) {
		resp, err = f.srv.Call(ctx, &opts)
		return f.shouldRetry(resp, err)
	})
	if err != nil {
		return nil, err
	}
	// we can't update/find metadata here as IA will also
	// queue server-side copy as well as upload/delete.
	return f.waitFileUpload(ctx, trimPathPrefix(path.Join(dstBucket, dstPath), f.root, f.opt.Enc), updateTracker, srcObj.size)
}
  575. // ListR lists the objects and directories of the Fs starting
  576. // from dir recursively into out.
  577. //
  578. // dir should be "" to start from the root, and should not
  579. // have trailing slashes.
  580. //
  581. // This should return ErrDirNotFound if the directory isn't
  582. // found.
  583. //
  584. // It should call callback for each tranche of entries read.
  585. // These need not be returned in any particular order. If
  586. // callback returns an error then the listing will stop
  587. // immediately.
  588. //
  589. // Don't implement this unless you have a more efficient way
  590. // of listing recursively than doing a directory traversal.
  591. func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (err error) {
  592. var allEntries, entries fs.DirEntries
  593. bucket, reqDir := f.split(dir)
  594. if bucket == "" {
  595. if reqDir != "" {
  596. return fs.ErrorListBucketRequired
  597. }
  598. return callback(entries)
  599. }
  600. grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, reqDir), "/") + "/")
  601. allEntries, err = f.listAllUnconstrained(ctx, bucket)
  602. if err != nil {
  603. return err
  604. }
  605. for _, ent := range allEntries {
  606. obj, ok := ent.(*Object)
  607. if ok && strings.HasPrefix(obj.remote, grandparent) {
  608. obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc)
  609. entries = append(entries, obj)
  610. }
  611. dire, ok := ent.(*fs.Dir)
  612. if ok && strings.HasPrefix(dire.Remote(), grandparent) {
  613. dire.SetRemote(trimPathPrefix(dire.Remote(), f.root, f.opt.Enc))
  614. entries = append(entries, dire)
  615. }
  616. }
  617. return callback(entries)
  618. }
// CleanUp removes all files inside history/
// (IA keeps replaced files there when keep-old-version was used)
func (f *Fs) CleanUp(ctx context.Context) (err error) {
	bucket, _ := f.split("/")
	if bucket == "" {
		return fs.ErrorListBucketRequired
	}
	entries, err := f.listAllUnconstrained(ctx, bucket)
	if err != nil {
		return err
	}
	for _, ent := range entries {
		obj, ok := ent.(*Object)
		if ok && strings.HasPrefix(obj.remote, bucket+"/history/") {
			err = obj.Remove(ctx)
			if err != nil {
				return err
			}
		}
		// we can fully ignore directories, as they're just virtual entries to
		// comply with rclone's requirement
	}
	return nil
}
// About returns things about remaining and used spaces
//
// Total is the fixed per-item limit; Trashed sums everything under
// history/ (replaced old versions).
func (f *Fs) About(ctx context.Context) (_ *fs.Usage, err error) {
	bucket, _ := f.split("/")
	if bucket == "" {
		return nil, fs.ErrorListBucketRequired
	}
	result, err := f.requestMetadata(ctx, bucket)
	if err != nil {
		return nil, err
	}
	// perform low-level operation here since it's ridiculous to make 2 same requests
	var historySize int64
	for _, ent := range result.Files {
		if strings.HasPrefix(ent.Name, "history/") {
			size := parseSize(ent.Size)
			if size < 0 {
				// parse error can be ignored since it's not fatal
				continue
			}
			historySize += size
		}
	}
	usage := &fs.Usage{
		Total:   fs.NewUsageValue(iaItemMaxSize),
		Free:    fs.NewUsageValue(iaItemMaxSize - result.ItemSize),
		Used:    fs.NewUsageValue(result.ItemSize),
		Trashed: fs.NewUsageValue(historySize), // bytes in trash
	}
	return usage, nil
}
// Open an object for read
//
// Downloads go through the frontend endpoint, not IAS3.
// The caller is responsible for closing the returned body.
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
	var optionsFixed []fs.OpenOption
	for _, opt := range options {
		if optRange, ok := opt.(*fs.RangeOption); ok {
			// Ignore range option if file is empty
			// (a Range on a zero-byte file would otherwise fail)
			if o.Size() == 0 && optRange.Start == 0 && optRange.End > 0 {
				continue
			}
		}
		optionsFixed = append(optionsFixed, opt)
	}
	var resp *http.Response
	// make a GET request to (frontend)/download/:item/:path
	opts := rest.Opts{
		Method:  "GET",
		Path:    path.Join("/download/", o.fs.root, quotePath(o.fs.opt.Enc.FromStandardPath(o.remote))),
		Options: optionsFixed,
	}
	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.front.Call(ctx, &opts)
		return o.fs.shouldRetry(resp, err)
	})
	if err != nil {
		return nil, err
	}
	return resp.Body, nil
}
// Update the Object from in with modTime and size
//
// Uploads via a PUT to IAS3 with IA-specific x-archive-*/x-amz-* headers,
// then (unless wait_archive is 0) waits for IA's ingest queue to surface
// the new file before refreshing the Object's cached attributes.
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (err error) {
	bucket, bucketPath := o.split()
	modTime := src.ModTime(ctx)
	size := src.Size()
	// random token so waitFileUpload can recognize this exact upload
	updateTracker := random.String(32)
	// Set the mtime in the metadata
	// internetarchive backend builds at header level as IAS3 has extension outside X-Amz-
	headers := map[string]string{
		// https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/iarequest.py#L158
		"x-amz-filemeta-rclone-mtime":        modTime.Format(time.RFC3339Nano),
		"x-amz-filemeta-rclone-update-track": updateTracker,
		// we add some more headers for intuitive actions
		"x-amz-auto-make-bucket":     "1",    // create an item if does not exist, do nothing if already
		"x-archive-auto-make-bucket": "1",    // same as above in IAS3 original way
		"x-archive-keep-old-version": "0",    // do not keep old versions (a.k.a. trashes in other clouds)
		"x-archive-meta-mediatype":   "data", // mark media type of the uploading file as "data"
		"x-archive-queue-derive":     "0",    // skip derivation process (e.g. encoding to smaller files, OCR on PDFs)
		"x-archive-cascade-delete":   "1",    // enable "cascate delete" (delete all derived files in addition to the file itself)
	}
	if size >= 0 {
		// size may be -1 when unknown; only then omit the length hints
		headers["Content-Length"] = fmt.Sprintf("%d", size)
		headers["x-archive-size-hint"] = fmt.Sprintf("%d", size)
	}
	var mdata fs.Metadata
	mdata, err = fs.GetMetadataOptions(ctx, o.fs, src, options)
	if err == nil && mdata != nil {
		for mk, mv := range mdata {
			mk = strings.ToLower(mk)
			if strings.HasPrefix(mk, "rclone-") {
				fs.LogPrintf(fs.LogLevelWarning, o, "reserved metadata key %s is about to set", mk)
			} else if _, ok := roMetadataKey[mk]; ok {
				fs.LogPrintf(fs.LogLevelWarning, o, "setting or modifying read-only key %s is requested, skipping", mk)
				continue
			} else if mk == "mtime" {
				// redirect to make it work
				mk = "rclone-mtime"
			}
			headers[fmt.Sprintf("x-amz-filemeta-%s", mk)] = mv
		}
	}
	// read the md5sum if available
	var md5sumHex string
	if !o.fs.opt.DisableChecksum {
		md5sumHex, err = src.Hash(ctx, hash.MD5)
		if err == nil && matchMd5.MatchString(md5sumHex) {
			// Set the md5sum in header on the object if
			// the user wants it
			// https://github.com/jjjake/internetarchive/blob/245637653/internetarchive/item.py#L969
			headers["Content-MD5"] = md5sumHex
		}
	}
	// make a PUT request at (IAS3)/encoded(:item/:path)
	var resp *http.Response
	opts := rest.Opts{
		Method:        "PUT",
		Path:          "/" + url.PathEscape(path.Join(bucket, bucketPath)),
		Body:          in,
		ContentLength: &size,
		ExtraHeaders:  headers,
	}
	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.srv.Call(ctx, &opts)
		return o.fs.shouldRetry(resp, err)
	})
	// we can't update/find metadata here as IA will "ingest" uploaded file(s)
	// upon uploads. (you can find its progress at https://archive.org/history/ItemNameHere )
	// or we have to wait for finish? (needs polling (frontend)/metadata/:item or scraping (frontend)/history/:item)
	var newObj *Object
	if err == nil {
		newObj, err = o.fs.waitFileUpload(ctx, o.remote, updateTracker, size)
	} else {
		// upload failed: reset cached attributes to zero values below
		newObj = &Object{}
	}
	o.crc32 = newObj.crc32
	o.md5 = newObj.md5
	o.sha1 = newObj.sha1
	o.modTime = newObj.modTime
	o.size = newObj.size
	return err
}
  781. // Remove an object
  782. func (o *Object) Remove(ctx context.Context) (err error) {
  783. bucket, bucketPath := o.split()
  784. // make a DELETE request at (IAS3)/:item/:path
  785. var resp *http.Response
  786. opts := rest.Opts{
  787. Method: "DELETE",
  788. Path: "/" + url.PathEscape(path.Join(bucket, bucketPath)),
  789. }
  790. err = o.fs.pacer.Call(func() (bool, error) {
  791. resp, err = o.fs.srv.Call(ctx, &opts)
  792. return o.fs.shouldRetry(resp, err)
  793. })
  794. // deleting files can take bit longer as
  795. // it'll be processed on same queue as uploads
  796. if err == nil {
  797. err = o.fs.waitDelete(ctx, bucket, bucketPath)
  798. }
  799. return err
  800. }
// String converts this Object to a string (its remote path),
// returning "<nil>" for a nil receiver so logging never panics.
func (o *Object) String() string {
	if o == nil {
		return "<nil>"
	}
	return o.remote
}
  808. // Metadata returns all file metadata provided by Internet Archive
  809. func (o *Object) Metadata(ctx context.Context) (m fs.Metadata, err error) {
  810. if o.rawData == nil {
  811. return nil, nil
  812. }
  813. raw := make(map[string]json.RawMessage)
  814. err = json.Unmarshal(o.rawData, &raw)
  815. if err != nil {
  816. // fatal: json parsing failed
  817. return
  818. }
  819. for k, v := range raw {
  820. items, err := listOrString(v)
  821. if len(items) == 0 || err != nil {
  822. // skip: an entry failed to parse
  823. continue
  824. }
  825. m.Set(k, items[0])
  826. }
  827. // move the old mtime to an another key
  828. if v, ok := m["mtime"]; ok {
  829. m["rclone-ia-mtime"] = v
  830. }
  831. // overwrite with a correct mtime
  832. m["mtime"] = o.modTime.Format(time.RFC3339Nano)
  833. return
  834. }
  835. func (f *Fs) shouldRetry(resp *http.Response, err error) (bool, error) {
  836. if resp != nil {
  837. for _, e := range retryErrorCodes {
  838. if resp.StatusCode == e {
  839. return true, err
  840. }
  841. }
  842. }
  843. // Ok, not an awserr, check for generic failure conditions
  844. return fserrors.ShouldRetry(err), err
  845. }
// matchMd5 matches a 32-character lowercase hex string, i.e. a valid MD5 digest.
var matchMd5 = regexp.MustCompile(`^[0-9a-f]{32}$`)
// split returns bucket and bucketPath from the rootRelativePath
// relative to f.root, with both parts encoded from the standard
// representation into the backend's on-the-wire form.
func (f *Fs) split(rootRelativePath string) (bucketName, bucketPath string) {
	bucketName, bucketPath = bucket.Split(path.Join(f.root, rootRelativePath))
	return f.opt.Enc.FromStandardName(bucketName), f.opt.Enc.FromStandardPath(bucketPath)
}
// split returns bucket and bucketPath from the object's remote path.
// (The named return "bucket" shadows the bucket package inside this
// function only.)
func (o *Object) split() (bucket, bucketPath string) {
	return o.fs.split(o.remote)
}
  857. func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result *MetadataResponse, err error) {
  858. var resp *http.Response
  859. // make a GET request to (frontend)/metadata/:item/
  860. opts := rest.Opts{
  861. Method: "GET",
  862. Path: path.Join("/metadata/", bucket),
  863. }
  864. var temp MetadataResponseRaw
  865. err = f.pacer.Call(func() (bool, error) {
  866. resp, err = f.front.CallJSON(ctx, &opts, nil, &temp)
  867. return f.shouldRetry(resp, err)
  868. })
  869. if err != nil {
  870. return
  871. }
  872. return temp.unraw()
  873. }
// list up all files/directories without any filters
//
// Directories are synthesized from file paths since IA metadata only
// lists files; each ancestor directory is emitted once with the mtime
// of the first file seen beneath it.
func (f *Fs) listAllUnconstrained(ctx context.Context, bucket string) (entries fs.DirEntries, err error) {
	result, err := f.requestMetadata(ctx, bucket)
	if err != nil {
		return nil, err
	}
	// directories already emitted; "" stands for the bucket root
	knownDirs := map[string]time.Time{
		"": time.Unix(0, 0),
	}
	for _, file := range result.Files {
		dir := strings.Trim(betterPathDir(file.Name), "/")
		nameWithBucket := path.Join(bucket, file.Name)
		mtimeTime := file.parseMtime()
		// populate children directories: walk up from the file's dir,
		// emitting each ancestor until one we've already seen
		child := dir
		for {
			if _, ok := knownDirs[child]; ok {
				break
			}
			// directory
			d := fs.NewDir(f.opt.Enc.ToStandardPath(path.Join(bucket, child)), mtimeTime)
			entries = append(entries, d)
			knownDirs[child] = mtimeTime
			child = strings.Trim(betterPathDir(child), "/")
		}
		// NOTE(review): knownDirs keys are slash-trimmed above, but this
		// lookup uses the untrimmed dirname — presumably file.Name never
		// carries leading/trailing slashes; confirm against real metadata
		if _, ok := knownDirs[betterPathDir(file.Name)]; !ok {
			continue
		}
		size := parseSize(file.Size)
		o := makeValidObject(f, f.opt.Enc.ToStandardPath(nameWithBucket), file, mtimeTime, size)
		entries = append(entries, o)
	}
	return entries, nil
}
// waitFileUpload polls the item metadata until the freshly uploaded file
// appears (identified by the update tracker header and size), returning a
// fully populated Object, or a placeholder if polling is disabled or the
// WaitArchive timeout elapses first.
func (f *Fs) waitFileUpload(ctx context.Context, reqPath, tracker string, newSize int64) (ret *Object, err error) {
	bucket, bucketPath := f.split(reqPath)
	// placeholder returned when we can't (or won't) confirm the upload
	ret = &Object{
		fs:      f,
		remote:  trimPathPrefix(path.Join(bucket, bucketPath), f.root, f.opt.Enc),
		modTime: time.Unix(0, 0),
		size:    -1,
	}
	if f.opt.WaitArchive == 0 {
		// user doesn't want to poll, let's not
		ret2, err := f.NewObject(ctx, reqPath)
		if err == nil {
			ret2, ok := ret2.(*Object)
			if ok {
				ret = ret2
				// the listed hashes/size may still describe the previous
				// version of the file, so blank them out
				ret.crc32 = ""
				ret.md5 = ""
				ret.sha1 = ""
				ret.size = -1
			}
		}
		return ret, nil
	}
	// buffered so the goroutine can exit even after the timeout fires
	retC := make(chan struct {
		*Object
		error
	}, 1)
	go func() {
		isFirstTime := true
		existed := false
		for {
			if !isFirstTime {
				// depending on the queue, it takes time
				time.Sleep(10 * time.Second)
			}
			metadata, err := f.requestMetadata(ctx, bucket)
			if err != nil {
				retC <- struct {
					*Object
					error
				}{ret, err}
				return
			}
			// look for our file in the item's file list
			var iaFile *IAFile
			for _, f := range metadata.Files {
				if f.Name == bucketPath {
					iaFile = &f
					break
				}
			}
			if isFirstTime {
				isFirstTime = false
				// remember whether the file pre-existed this upload
				existed = iaFile != nil
			}
			if iaFile == nil {
				continue
			}
			// NOTE(review): isFirstTime is already false at this point, so
			// the condition reduces to !existed — the file appearing at all
			// proves the upload landed when it didn't exist before
			if !existed && !isFirstTime {
				// fast path: the file didn't exist before this upload
				retC <- struct {
					*Object
					error
				}{makeValidObject2(f, *iaFile, bucket), nil}
				return
			}
			// otherwise match our upload via the tracker value we set in
			// the x-amz-filemeta-rclone-update-track header
			fileTrackers, _ := listOrString(iaFile.UpdateTrack)
			trackerMatch := false
			for _, v := range fileTrackers {
				if v == tracker {
					trackerMatch = true
					break
				}
			}
			if !trackerMatch {
				continue
			}
			// also require the size to match (unknown sizes compare equal)
			if !compareSize(parseSize(iaFile.Size), newSize) {
				continue
			}
			// voila!
			retC <- struct {
				*Object
				error
			}{makeValidObject2(f, *iaFile, bucket), nil}
			return
		}
	}()
	// wait for the poller, or give up after the configured WaitArchive
	select {
	case res := <-retC:
		return res.Object, res.error
	case <-time.After(time.Duration(f.opt.WaitArchive)):
		return ret, nil
	}
}
  1002. func (f *Fs) waitDelete(ctx context.Context, bucket, bucketPath string) (err error) {
  1003. if f.opt.WaitArchive == 0 {
  1004. // user doesn't want to poll, let's not
  1005. return nil
  1006. }
  1007. retC := make(chan error, 1)
  1008. go func() {
  1009. for {
  1010. metadata, err := f.requestMetadata(ctx, bucket)
  1011. if err != nil {
  1012. retC <- err
  1013. return
  1014. }
  1015. found := false
  1016. for _, f := range metadata.Files {
  1017. if f.Name == bucketPath {
  1018. found = true
  1019. break
  1020. }
  1021. }
  1022. if !found {
  1023. retC <- nil
  1024. return
  1025. }
  1026. // depending on the queue, it takes time
  1027. time.Sleep(10 * time.Second)
  1028. }
  1029. }()
  1030. select {
  1031. case res := <-retC:
  1032. return res
  1033. case <-time.After(time.Duration(f.opt.WaitArchive)):
  1034. return nil
  1035. }
  1036. }
  1037. func makeValidObject(f *Fs, remote string, file IAFile, mtime time.Time, size int64) *Object {
  1038. ret := &Object{
  1039. fs: f,
  1040. remote: remote,
  1041. modTime: mtime,
  1042. size: size,
  1043. rawData: file.rawData,
  1044. }
  1045. // hashes from _files.xml (where summation != "") is different from one in other files
  1046. // https://forum.rclone.org/t/internet-archive-md5-tag-in-id-files-xml-interpreted-incorrectly/31922
  1047. if file.Summation == "" {
  1048. ret.md5 = file.Md5
  1049. ret.crc32 = file.Crc32
  1050. ret.sha1 = file.Sha1
  1051. }
  1052. return ret
  1053. }
  1054. func makeValidObject2(f *Fs, file IAFile, bucket string) *Object {
  1055. mtimeTime := file.parseMtime()
  1056. size := parseSize(file.Size)
  1057. return makeValidObject(f, trimPathPrefix(path.Join(bucket, file.Name), f.root, f.opt.Enc), file, mtimeTime, size)
  1058. }
  1059. func listOrString(jm json.RawMessage) (rmArray []string, err error) {
  1060. // rclone-metadata can be an array or string
  1061. // try to deserialize it as array first
  1062. err = json.Unmarshal(jm, &rmArray)
  1063. if err != nil {
  1064. // if not, it's a string
  1065. dst := new(string)
  1066. err = json.Unmarshal(jm, dst)
  1067. if err == nil {
  1068. rmArray = []string{*dst}
  1069. }
  1070. }
  1071. return
  1072. }
// parseMtime derives a modification time for the file, preferring the
// rclone-written metadata and falling back to IA's own mtime field,
// and finally to the Unix epoch when neither parses.
func (file IAFile) parseMtime() (mtime time.Time) {
	// method 1: use metadata added by rclone
	rmArray, err := listOrString(file.RcloneMtime)
	// let's take the first value we can deserialize
	// (err from the loop deliberately carries into the checks below)
	for _, value := range rmArray {
		mtime, err = time.Parse(time.RFC3339Nano, value)
		if err == nil {
			break
		}
	}
	if err != nil {
		// method 2: use metadata added by IAS3
		// (file.Mtime is presumably epoch seconds as a string — the swift
		// helper parses possibly-fractional float strings)
		mtime, err = swift.FloatStringToTime(file.Mtime)
	}
	if err != nil {
		// metadata files don't have some of the fields
		mtime = time.Unix(0, 0)
	}
	return mtime
}
  1093. func (mrr *MetadataResponseRaw) unraw() (_ *MetadataResponse, err error) {
  1094. var files []IAFile
  1095. for _, raw := range mrr.Files {
  1096. var parsed IAFile
  1097. err = json.Unmarshal(raw, &parsed)
  1098. if err != nil {
  1099. return nil, err
  1100. }
  1101. parsed.rawData = raw
  1102. files = append(files, parsed)
  1103. }
  1104. return &MetadataResponse{
  1105. Files: files,
  1106. ItemSize: mrr.ItemSize,
  1107. }, nil
  1108. }
  1109. func compareSize(a, b int64) bool {
  1110. if a < 0 || b < 0 {
  1111. // we won't compare if any of them is not known
  1112. return true
  1113. }
  1114. return a == b
  1115. }
  1116. func parseSize(str string) int64 {
  1117. size, err := strconv.ParseInt(str, 10, 64)
  1118. if err != nil {
  1119. size = -1
  1120. }
  1121. return size
  1122. }
  1123. func betterPathDir(p string) string {
  1124. d := path.Dir(p)
  1125. if d == "." {
  1126. return ""
  1127. }
  1128. return d
  1129. }
  1130. func betterPathClean(p string) string {
  1131. d := path.Clean(p)
  1132. if d == "." {
  1133. return ""
  1134. }
  1135. return d
  1136. }
  1137. func trimPathPrefix(s, prefix string, enc encoder.MultiEncoder) string {
  1138. // we need to clean the paths to make tests pass!
  1139. s = betterPathClean(s)
  1140. prefix = betterPathClean(prefix)
  1141. if s == prefix || s == prefix+"/" {
  1142. return ""
  1143. }
  1144. prefix = enc.ToStandardPath(strings.TrimRight(prefix, "/"))
  1145. return enc.ToStandardPath(strings.TrimPrefix(s, prefix+"/"))
  1146. }
  1147. // mimics urllib.parse.quote() on Python; exclude / from url.PathEscape
  1148. func quotePath(s string) string {
  1149. seg := strings.Split(s, "/")
  1150. newValues := []string{}
  1151. for _, v := range seg {
  1152. newValues = append(newValues, url.PathEscape(v))
  1153. }
  1154. return strings.Join(newValues, "/")
  1155. }
// Check the interfaces are satisfied at compile time.
var (
	_ fs.Fs           = &Fs{}
	_ fs.Copier       = &Fs{}
	_ fs.ListRer      = &Fs{}
	_ fs.CleanUpper   = &Fs{}
	_ fs.PublicLinker = &Fs{}
	_ fs.Abouter      = &Fs{}
	_ fs.Object       = &Object{}
	_ fs.Metadataer   = &Object{}
)