// parse.js
  1. 'use strict'
  2. // this[BUFFER] is the remainder of a chunk if we're waiting for
  3. // the full 512 bytes of a header to come in. We will Buffer.concat()
  4. // it to the next write(), which is a mem copy, but a small one.
  5. //
  // this[QUEUE] is a Yallist of entries that haven't been emitted
  // yet. This can only get filled up if the user keeps write()ing after
  // a write() returns false, or does a write() with more than one entry.
  9. //
  10. // We don't buffer chunks, we always parse them and either create an
  11. // entry, or push it into the active entry. The ReadEntry class knows
  12. // to throw data away if .ignore=true
  13. //
  14. // Shift entry off the buffer when it emits 'end', and emit 'entry' for
  15. // the next one in the list.
  16. //
  17. // At any time, we're pushing body chunks into the entry at WRITEENTRY,
  18. // and waiting for 'end' on the entry at READENTRY
  19. //
  20. // ignored entries get .resume() called on them straight away
  21. const warner = require('./warn-mixin.js')
  22. const Header = require('./header.js')
  23. const EE = require('events')
  24. const Yallist = require('yallist')
  25. const maxMetaEntrySize = 1024 * 1024
  26. const Entry = require('./read-entry.js')
  27. const Pax = require('./pax.js')
  28. const zlib = require('minizlib')
  29. const { nextTick } = require('process')
  30. const gzipHeader = Buffer.from([0x1f, 0x8b])
  // Symbol keys used for the parser's internal fields, internal methods and
  // its private completion event, so they stay off the string-keyed surface
  // of Parser instances.

  // -- internal data fields --
  const STATE = Symbol('state')
  const BUFFER = Symbol('buffer')
  const QUEUE = Symbol('queue')
  const READENTRY = Symbol('readEntry')
  const WRITEENTRY = Symbol('writeEntry')
  const META = Symbol('meta')
  const EX = Symbol('extendedHeader')
  const GEX = Symbol('globalExtendedHeader')
  const UNZIP = Symbol('unzip')

  // -- internal boolean flags --
  const ENDED = Symbol('ended')
  const EMITTEDEND = Symbol('emittedEnd')
  const ABORTED = Symbol('aborted')
  const CONSUMING = Symbol('consuming')
  const WRITING = Symbol('writing')
  const SAW_VALID_ENTRY = Symbol('sawValidEntry')
  const SAW_NULL_BLOCK = Symbol('sawNullBlock')
  const SAW_EOF = Symbol('sawEOF')

  // -- private event emitted once all input has been processed --
  const DONE = Symbol('onDone')

  // -- internal methods --
  const NEXTENTRY = Symbol('nextEntry')
  const PROCESSENTRY = Symbol('processEntry')
  const EMIT = Symbol('emit')
  const EMITMETA = Symbol('emitMeta')
  const CONSUMECHUNK = Symbol('consumeChunk')
  const CONSUMECHUNKSUB = Symbol('consumeChunkSub')
  const CONSUMEBODY = Symbol('consumeBody')
  const CONSUMEMETA = Symbol('consumeMeta')
  const CONSUMEHEADER = Symbol('consumeHeader')
  const BUFFERCONCAT = Symbol('bufferConcat')
  const MAYBEEND = Symbol('maybeEnd')
  const CLOSESTREAM = Symbol('closeStream')

  // default entry filter: accept everything
  const noop = _path => true
  62. module.exports = warner(class Parser extends EE {
  63. constructor (opt) {
  64. opt = opt || {}
  65. super(opt)
  66. this.file = opt.file || ''
  67. // set to boolean false when an entry starts. 1024 bytes of \0
  68. // is technically a valid tarball, albeit a boring one.
  69. this[SAW_VALID_ENTRY] = null
  70. // these BADARCHIVE errors can't be detected early. listen on DONE.
  71. this.on(DONE, _ => {
  72. if (this[STATE] === 'begin' || this[SAW_VALID_ENTRY] === false) {
  73. // either less than 1 block of data, or all entries were invalid.
  74. // Either way, probably not even a tarball.
  75. this.warn('TAR_BAD_ARCHIVE', 'Unrecognized archive format')
  76. }
  77. })
  78. if (opt.ondone) {
  79. this.on(DONE, opt.ondone)
  80. } else {
  81. this.on(DONE, _ => {
  82. this.emit('prefinish')
  83. this.emit('finish')
  84. this.emit('end')
  85. })
  86. }
  87. this.strict = !!opt.strict
  88. this.maxMetaEntrySize = opt.maxMetaEntrySize || maxMetaEntrySize
  89. this.filter = typeof opt.filter === 'function' ? opt.filter : noop
  90. // Unlike gzip, brotli doesn't have any magic bytes to identify it
  91. // Users need to explicitly tell us they're extracting a brotli file
  92. // Or we infer from the file extension
  93. const isTBR = (opt.file && (
  94. opt.file.endsWith('.tar.br') || opt.file.endsWith('.tbr')))
  95. // if it's a tbr file it MIGHT be brotli, but we don't know until
  96. // we look at it and verify it's not a valid tar file.
  97. this.brotli = !opt.gzip && opt.brotli !== undefined ? opt.brotli
  98. : isTBR ? undefined
  99. : false
  100. // have to set this so that streams are ok piping into it
  101. this.writable = true
  102. this.readable = false
  103. this[QUEUE] = new Yallist()
  104. this[BUFFER] = null
  105. this[READENTRY] = null
  106. this[WRITEENTRY] = null
  107. this[STATE] = 'begin'
  108. this[META] = ''
  109. this[EX] = null
  110. this[GEX] = null
  111. this[ENDED] = false
  112. this[UNZIP] = null
  113. this[ABORTED] = false
  114. this[SAW_NULL_BLOCK] = false
  115. this[SAW_EOF] = false
  116. this.on('end', () => this[CLOSESTREAM]())
  117. if (typeof opt.onwarn === 'function') {
  118. this.on('warn', opt.onwarn)
  119. }
  120. if (typeof opt.onentry === 'function') {
  121. this.on('entry', opt.onentry)
  122. }
  123. }
  124. [CONSUMEHEADER] (chunk, position) {
  125. if (this[SAW_VALID_ENTRY] === null) {
  126. this[SAW_VALID_ENTRY] = false
  127. }
  128. let header
  129. try {
  130. header = new Header(chunk, position, this[EX], this[GEX])
  131. } catch (er) {
  132. return this.warn('TAR_ENTRY_INVALID', er)
  133. }
  134. if (header.nullBlock) {
  135. if (this[SAW_NULL_BLOCK]) {
  136. this[SAW_EOF] = true
  137. // ending an archive with no entries. pointless, but legal.
  138. if (this[STATE] === 'begin') {
  139. this[STATE] = 'header'
  140. }
  141. this[EMIT]('eof')
  142. } else {
  143. this[SAW_NULL_BLOCK] = true
  144. this[EMIT]('nullBlock')
  145. }
  146. } else {
  147. this[SAW_NULL_BLOCK] = false
  148. if (!header.cksumValid) {
  149. this.warn('TAR_ENTRY_INVALID', 'checksum failure', { header })
  150. } else if (!header.path) {
  151. this.warn('TAR_ENTRY_INVALID', 'path is required', { header })
  152. } else {
  153. const type = header.type
  154. if (/^(Symbolic)?Link$/.test(type) && !header.linkpath) {
  155. this.warn('TAR_ENTRY_INVALID', 'linkpath required', { header })
  156. } else if (!/^(Symbolic)?Link$/.test(type) && header.linkpath) {
  157. this.warn('TAR_ENTRY_INVALID', 'linkpath forbidden', { header })
  158. } else {
  159. const entry = this[WRITEENTRY] = new Entry(header, this[EX], this[GEX])
  160. // we do this for meta & ignored entries as well, because they
  161. // are still valid tar, or else we wouldn't know to ignore them
  162. if (!this[SAW_VALID_ENTRY]) {
  163. if (entry.remain) {
  164. // this might be the one!
  165. const onend = () => {
  166. if (!entry.invalid) {
  167. this[SAW_VALID_ENTRY] = true
  168. }
  169. }
  170. entry.on('end', onend)
  171. } else {
  172. this[SAW_VALID_ENTRY] = true
  173. }
  174. }
  175. if (entry.meta) {
  176. if (entry.size > this.maxMetaEntrySize) {
  177. entry.ignore = true
  178. this[EMIT]('ignoredEntry', entry)
  179. this[STATE] = 'ignore'
  180. entry.resume()
  181. } else if (entry.size > 0) {
  182. this[META] = ''
  183. entry.on('data', c => this[META] += c)
  184. this[STATE] = 'meta'
  185. }
  186. } else {
  187. this[EX] = null
  188. entry.ignore = entry.ignore || !this.filter(entry.path, entry)
  189. if (entry.ignore) {
  190. // probably valid, just not something we care about
  191. this[EMIT]('ignoredEntry', entry)
  192. this[STATE] = entry.remain ? 'ignore' : 'header'
  193. entry.resume()
  194. } else {
  195. if (entry.remain) {
  196. this[STATE] = 'body'
  197. } else {
  198. this[STATE] = 'header'
  199. entry.end()
  200. }
  201. if (!this[READENTRY]) {
  202. this[QUEUE].push(entry)
  203. this[NEXTENTRY]()
  204. } else {
  205. this[QUEUE].push(entry)
  206. }
  207. }
  208. }
  209. }
  210. }
  211. }
  212. }
  213. [CLOSESTREAM] () {
  214. nextTick(() => this.emit('close'))
  215. }
  216. [PROCESSENTRY] (entry) {
  217. let go = true
  218. if (!entry) {
  219. this[READENTRY] = null
  220. go = false
  221. } else if (Array.isArray(entry)) {
  222. this.emit.apply(this, entry)
  223. } else {
  224. this[READENTRY] = entry
  225. this.emit('entry', entry)
  226. if (!entry.emittedEnd) {
  227. entry.on('end', _ => this[NEXTENTRY]())
  228. go = false
  229. }
  230. }
  231. return go
  232. }
  233. [NEXTENTRY] () {
  234. do {} while (this[PROCESSENTRY](this[QUEUE].shift()))
  235. if (!this[QUEUE].length) {
  236. // At this point, there's nothing in the queue, but we may have an
  237. // entry which is being consumed (readEntry).
  238. // If we don't, then we definitely can handle more data.
  239. // If we do, and either it's flowing, or it has never had any data
  240. // written to it, then it needs more.
  241. // The only other possibility is that it has returned false from a
  242. // write() call, so we wait for the next drain to continue.
  243. const re = this[READENTRY]
  244. const drainNow = !re || re.flowing || re.size === re.remain
  245. if (drainNow) {
  246. if (!this[WRITING]) {
  247. this.emit('drain')
  248. }
  249. } else {
  250. re.once('drain', _ => this.emit('drain'))
  251. }
  252. }
  253. }
  254. [CONSUMEBODY] (chunk, position) {
  255. // write up to but no more than writeEntry.blockRemain
  256. const entry = this[WRITEENTRY]
  257. const br = entry.blockRemain
  258. const c = (br >= chunk.length && position === 0) ? chunk
  259. : chunk.slice(position, position + br)
  260. entry.write(c)
  261. if (!entry.blockRemain) {
  262. this[STATE] = 'header'
  263. this[WRITEENTRY] = null
  264. entry.end()
  265. }
  266. return c.length
  267. }
  268. [CONSUMEMETA] (chunk, position) {
  269. const entry = this[WRITEENTRY]
  270. const ret = this[CONSUMEBODY](chunk, position)
  271. // if we finished, then the entry is reset
  272. if (!this[WRITEENTRY]) {
  273. this[EMITMETA](entry)
  274. }
  275. return ret
  276. }
  277. [EMIT] (ev, data, extra) {
  278. if (!this[QUEUE].length && !this[READENTRY]) {
  279. this.emit(ev, data, extra)
  280. } else {
  281. this[QUEUE].push([ev, data, extra])
  282. }
  283. }
  284. [EMITMETA] (entry) {
  285. this[EMIT]('meta', this[META])
  286. switch (entry.type) {
  287. case 'ExtendedHeader':
  288. case 'OldExtendedHeader':
  289. this[EX] = Pax.parse(this[META], this[EX], false)
  290. break
  291. case 'GlobalExtendedHeader':
  292. this[GEX] = Pax.parse(this[META], this[GEX], true)
  293. break
  294. case 'NextFileHasLongPath':
  295. case 'OldGnuLongPath':
  296. this[EX] = this[EX] || Object.create(null)
  297. this[EX].path = this[META].replace(/\0.*/, '')
  298. break
  299. case 'NextFileHasLongLinkpath':
  300. this[EX] = this[EX] || Object.create(null)
  301. this[EX].linkpath = this[META].replace(/\0.*/, '')
  302. break
  303. /* istanbul ignore next */
  304. default: throw new Error('unknown meta: ' + entry.type)
  305. }
  306. }
  307. abort (error) {
  308. this[ABORTED] = true
  309. this.emit('abort', error)
  310. // always throws, even in non-strict mode
  311. this.warn('TAR_ABORT', error, { recoverable: false })
  312. }
  313. write (chunk) {
  314. if (this[ABORTED]) {
  315. return
  316. }
  317. // first write, might be gzipped
  318. const needSniff = this[UNZIP] === null ||
  319. this.brotli === undefined && this[UNZIP] === false
  320. if (needSniff && chunk) {
  321. if (this[BUFFER]) {
  322. chunk = Buffer.concat([this[BUFFER], chunk])
  323. this[BUFFER] = null
  324. }
  325. if (chunk.length < gzipHeader.length) {
  326. this[BUFFER] = chunk
  327. return true
  328. }
  329. // look for gzip header
  330. for (let i = 0; this[UNZIP] === null && i < gzipHeader.length; i++) {
  331. if (chunk[i] !== gzipHeader[i]) {
  332. this[UNZIP] = false
  333. }
  334. }
  335. const maybeBrotli = this.brotli === undefined
  336. if (this[UNZIP] === false && maybeBrotli) {
  337. // read the first header to see if it's a valid tar file. If so,
  338. // we can safely assume that it's not actually brotli, despite the
  339. // .tbr or .tar.br file extension.
  340. // if we ended before getting a full chunk, yes, def brotli
  341. if (chunk.length < 512) {
  342. if (this[ENDED]) {
  343. this.brotli = true
  344. } else {
  345. this[BUFFER] = chunk
  346. return true
  347. }
  348. } else {
  349. // if it's tar, it's pretty reliably not brotli, chances of
  350. // that happening are astronomical.
  351. try {
  352. new Header(chunk.slice(0, 512))
  353. this.brotli = false
  354. } catch (_) {
  355. this.brotli = true
  356. }
  357. }
  358. }
  359. if (this[UNZIP] === null || (this[UNZIP] === false && this.brotli)) {
  360. const ended = this[ENDED]
  361. this[ENDED] = false
  362. this[UNZIP] = this[UNZIP] === null
  363. ? new zlib.Unzip()
  364. : new zlib.BrotliDecompress()
  365. this[UNZIP].on('data', chunk => this[CONSUMECHUNK](chunk))
  366. this[UNZIP].on('error', er => this.abort(er))
  367. this[UNZIP].on('end', _ => {
  368. this[ENDED] = true
  369. this[CONSUMECHUNK]()
  370. })
  371. this[WRITING] = true
  372. const ret = this[UNZIP][ended ? 'end' : 'write'](chunk)
  373. this[WRITING] = false
  374. return ret
  375. }
  376. }
  377. this[WRITING] = true
  378. if (this[UNZIP]) {
  379. this[UNZIP].write(chunk)
  380. } else {
  381. this[CONSUMECHUNK](chunk)
  382. }
  383. this[WRITING] = false
  384. // return false if there's a queue, or if the current entry isn't flowing
  385. const ret =
  386. this[QUEUE].length ? false :
  387. this[READENTRY] ? this[READENTRY].flowing :
  388. true
  389. // if we have no queue, then that means a clogged READENTRY
  390. if (!ret && !this[QUEUE].length) {
  391. this[READENTRY].once('drain', _ => this.emit('drain'))
  392. }
  393. return ret
  394. }
  395. [BUFFERCONCAT] (c) {
  396. if (c && !this[ABORTED]) {
  397. this[BUFFER] = this[BUFFER] ? Buffer.concat([this[BUFFER], c]) : c
  398. }
  399. }
  400. [MAYBEEND] () {
  401. if (this[ENDED] &&
  402. !this[EMITTEDEND] &&
  403. !this[ABORTED] &&
  404. !this[CONSUMING]) {
  405. this[EMITTEDEND] = true
  406. const entry = this[WRITEENTRY]
  407. if (entry && entry.blockRemain) {
  408. // truncated, likely a damaged file
  409. const have = this[BUFFER] ? this[BUFFER].length : 0
  410. this.warn('TAR_BAD_ARCHIVE', `Truncated input (needed ${
  411. entry.blockRemain} more bytes, only ${have} available)`, { entry })
  412. if (this[BUFFER]) {
  413. entry.write(this[BUFFER])
  414. }
  415. entry.end()
  416. }
  417. this[EMIT](DONE)
  418. }
  419. }
  420. [CONSUMECHUNK] (chunk) {
  421. if (this[CONSUMING]) {
  422. this[BUFFERCONCAT](chunk)
  423. } else if (!chunk && !this[BUFFER]) {
  424. this[MAYBEEND]()
  425. } else {
  426. this[CONSUMING] = true
  427. if (this[BUFFER]) {
  428. this[BUFFERCONCAT](chunk)
  429. const c = this[BUFFER]
  430. this[BUFFER] = null
  431. this[CONSUMECHUNKSUB](c)
  432. } else {
  433. this[CONSUMECHUNKSUB](chunk)
  434. }
  435. while (this[BUFFER] &&
  436. this[BUFFER].length >= 512 &&
  437. !this[ABORTED] &&
  438. !this[SAW_EOF]) {
  439. const c = this[BUFFER]
  440. this[BUFFER] = null
  441. this[CONSUMECHUNKSUB](c)
  442. }
  443. this[CONSUMING] = false
  444. }
  445. if (!this[BUFFER] || this[ENDED]) {
  446. this[MAYBEEND]()
  447. }
  448. }
  449. [CONSUMECHUNKSUB] (chunk) {
  450. // we know that we are in CONSUMING mode, so anything written goes into
  451. // the buffer. Advance the position and put any remainder in the buffer.
  452. let position = 0
  453. const length = chunk.length
  454. while (position + 512 <= length && !this[ABORTED] && !this[SAW_EOF]) {
  455. switch (this[STATE]) {
  456. case 'begin':
  457. case 'header':
  458. this[CONSUMEHEADER](chunk, position)
  459. position += 512
  460. break
  461. case 'ignore':
  462. case 'body':
  463. position += this[CONSUMEBODY](chunk, position)
  464. break
  465. case 'meta':
  466. position += this[CONSUMEMETA](chunk, position)
  467. break
  468. /* istanbul ignore next */
  469. default:
  470. throw new Error('invalid state: ' + this[STATE])
  471. }
  472. }
  473. if (position < length) {
  474. if (this[BUFFER]) {
  475. this[BUFFER] = Buffer.concat([chunk.slice(position), this[BUFFER]])
  476. } else {
  477. this[BUFFER] = chunk.slice(position)
  478. }
  479. }
  480. }
  481. end (chunk) {
  482. if (!this[ABORTED]) {
  483. if (this[UNZIP]) {
  484. this[UNZIP].end(chunk)
  485. } else {
  486. this[ENDED] = true
  487. if (this.brotli === undefined) chunk = chunk || Buffer.alloc(0)
  488. this.write(chunk)
  489. }
  490. }
  491. }
  492. })