package zim
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"io/ioutil"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
|
||
|
lru "github.com/hashicorp/golang-lru/v2"
|
||
|
)
|
||
|
|
||
|
const (
	// RedirectEntry marks a directory entry whose mimetype field flags a
	// redirect to another entry instead of an index into mimeTypeList.
	RedirectEntry uint16 = 0xffff
	// LinkTargetEntry marks a link-target entry (carries no content).
	LinkTargetEntry = 0xfffe
	// DeletedEntry marks an entry deleted from the archive.
	DeletedEntry = 0xfffd
)
|
||
|
|
||
|
// articlePool recycles Article values to reduce allocations when walking
// many directory entries.
// NOTE(review): no New func is set here — presumably installed elsewhere,
// otherwise Get can return a nil interface; confirm before relying on it.
var articlePool sync.Pool

// the recent uncompressed blobs, mainly useful while indexing and asking
// for the same blob again and again
// NOTE(review): bcache is only declared here — it must be initialized
// elsewhere before Article.Data is called, or Get/Add will fail on a nil
// cache; verify the init site.
var bcache *lru.Cache[any, any]
|
||
|
|
||
|
// Article represents one directory entry of a ZIM archive: either a
// content entry, a redirect, a link target, or a deleted entry.
type Article struct {
	// EntryType is a RedirectEntry/LinkTargetEntry/DeletedEntry or an idx
	// pointing to ZimReader.mimeTypeList
	EntryType uint16
	// Title is the human-readable title of the entry.
	Title string
	// URLPtr is the absolute file offset this entry was read from.
	URLPtr uint64
	// Namespace is the single-byte ZIM namespace (e.g. 'A' for articles).
	Namespace byte
	// url is the entry's URL without the namespace prefix.
	url string
	// blob is the blob number inside the cluster holding the content.
	blob uint32
	// cluster is the cluster number — for RedirectEntry it instead holds
	// the redirect's URL-list index (see RedirectIndex).
	cluster uint32
	// z is the reader this article was loaded from.
	z *ZimReader
}
|
||
|
|
||
|
// convenient method to return the Article at URL index idx
|
||
|
func (z *ZimReader) ArticleAtURLIdx(idx uint32) (*Article, error) {
|
||
|
o, err := z.OffsetAtURLIdx(idx)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
return z.ArticleAt(o)
|
||
|
}
|
||
|
|
||
|
// return the article main page if it exists
|
||
|
func (z *ZimReader) MainPage() (*Article, error) {
|
||
|
if z.mainPage == 0xffffffff {
|
||
|
return nil, nil
|
||
|
}
|
||
|
return z.ArticleAtURLIdx(z.mainPage)
|
||
|
}
|
||
|
|
||
|
// get the article (Directory) pointed by the offset found in URLpos or Titlepos
|
||
|
func (z *ZimReader) ArticleAt(offset uint64) (*Article, error) {
|
||
|
a := articlePool.Get().(*Article)
|
||
|
err := z.FillArticleAt(a, offset)
|
||
|
return a, err
|
||
|
}
|
||
|
|
||
|
// Fill an article with datas found at offset
|
||
|
func (z *ZimReader) FillArticleAt(a *Article, offset uint64) error {
|
||
|
a.z = z
|
||
|
a.URLPtr = offset
|
||
|
|
||
|
mimeIdx, err := readInt16(z.bytesRangeAt(offset, offset+2))
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("can't read article %w", err)
|
||
|
}
|
||
|
a.EntryType = mimeIdx
|
||
|
|
||
|
// Linktarget or Target Entry
|
||
|
if mimeIdx == LinkTargetEntry || mimeIdx == DeletedEntry {
|
||
|
// TODO
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
s, err := z.bytesRangeAt(offset+3, offset+4)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.Namespace = s[0]
|
||
|
|
||
|
a.cluster, err = readInt32(z.bytesRangeAt(offset+8, offset+8+4))
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.blob, err = readInt32(z.bytesRangeAt(offset+12, offset+12+4))
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// Redirect
|
||
|
if mimeIdx == RedirectEntry {
|
||
|
// assume the url + title won't be longer than 2k
|
||
|
b, err := z.bytesRangeAt(offset+12, offset+12+2048)
|
||
|
if err != nil {
|
||
|
return nil
|
||
|
}
|
||
|
bbuf := bytes.NewBuffer(b)
|
||
|
a.url, err = bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.url = strings.TrimRight(a.url, "\x00")
|
||
|
|
||
|
a.Title, err = bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.Title = strings.TrimRight(a.Title, "\x00")
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
b, err := z.bytesRangeAt(offset+16, offset+16+2048)
|
||
|
if err != nil {
|
||
|
return nil
|
||
|
}
|
||
|
bbuf := bytes.NewBuffer(b)
|
||
|
a.url, err = bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
a.url = strings.TrimRight(string(a.url), "\x00")
|
||
|
|
||
|
title, err := bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
title = strings.TrimRight(string(title), "\x00")
|
||
|
// This is a trick to force a copy and avoid retain of the full buffer
|
||
|
// mainly for indexing title reasons
|
||
|
if len(title) != 0 {
|
||
|
a.Title = title[0:1] + title[1:]
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// return the uncompressed data associated with this article
|
||
|
func (a *Article) Data() ([]byte, error) {
|
||
|
// ensure we have data to read
|
||
|
if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry {
|
||
|
return nil, nil
|
||
|
}
|
||
|
start, end, err := a.z.clusterOffsetsAtIdx(a.cluster)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
s, err := a.z.bytesRangeAt(start, start+1)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
compression := uint8(s[0])
|
||
|
|
||
|
// blob starts at offset, blob ends at offset
|
||
|
var bs, be uint32
|
||
|
|
||
|
// LZMA: 4, Zstandard: 5
|
||
|
if compression == 4 || compression == 5 {
|
||
|
blobLookup := func() ([]byte, bool) {
|
||
|
if v, ok := bcache.Get(a.cluster); ok {
|
||
|
b := v.([]byte)
|
||
|
return b, ok
|
||
|
}
|
||
|
return nil, false
|
||
|
}
|
||
|
|
||
|
var blob []byte
|
||
|
var ok bool
|
||
|
var dec io.ReadCloser
|
||
|
if blob, ok = blobLookup(); !ok {
|
||
|
b, err := a.z.bytesRangeAt(start+1, end+1)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
bbuf := bytes.NewBuffer(b)
|
||
|
switch compression {
|
||
|
case 5:
|
||
|
dec, err = NewZstdReader(bbuf)
|
||
|
|
||
|
case 4:
|
||
|
dec, err = NewXZReader(bbuf)
|
||
|
}
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
defer dec.Close()
|
||
|
// the decoded chunk are around 1MB
|
||
|
b, err = ioutil.ReadAll(dec)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
blob = make([]byte, len(b))
|
||
|
copy(blob, b)
|
||
|
// TODO: 2 requests for the same blob could occure at the same time
|
||
|
bcache.Add(a.cluster, blob)
|
||
|
} else {
|
||
|
bi, ok := bcache.Get(a.cluster)
|
||
|
if !ok {
|
||
|
return nil, errors.New("not in cache anymore")
|
||
|
}
|
||
|
blob = bi.([]byte)
|
||
|
}
|
||
|
|
||
|
bs, err = readInt32(blob[a.blob*4:a.blob*4+4], nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
be, err = readInt32(blob[a.blob*4+4:a.blob*4+4+4], nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// avoid retaining all the chunk
|
||
|
c := make([]byte, be-bs)
|
||
|
copy(c, blob[bs:be])
|
||
|
return c, nil
|
||
|
|
||
|
} else if compression == 0 || compression == 1 {
|
||
|
// uncompresssed
|
||
|
startPos := start + 1
|
||
|
blobOffset := uint64(a.blob * 4)
|
||
|
|
||
|
bs, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset, startPos+blobOffset+4))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
be, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset+4, startPos+blobOffset+4+4))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be))
|
||
|
}
|
||
|
|
||
|
return nil, errors.New("Unhandled compression")
|
||
|
}
|
||
|
|
||
|
func (a *Article) MimeType() string {
|
||
|
if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry {
|
||
|
return ""
|
||
|
}
|
||
|
|
||
|
return a.z.mimeTypeList[a.EntryType]
|
||
|
}
|
||
|
|
||
|
// return the url prefixed by the namespace
|
||
|
func (a *Article) FullURL() string {
|
||
|
return string(a.Namespace) + "/" + a.url
|
||
|
}
|
||
|
|
||
|
func (a *Article) String() string {
|
||
|
return fmt.Sprintf("Mime: 0x%x URL: [%s], Title: [%s], Cluster: 0x%x Blob: 0x%x",
|
||
|
a.EntryType, a.FullURL(), a.Title, a.cluster, a.blob)
|
||
|
}
|
||
|
|
||
|
// RedirectIndex return the redirect index of RedirectEntry type article
|
||
|
// return an err if not a redirect entry
|
||
|
func (a *Article) RedirectIndex() (uint32, error) {
|
||
|
if a.EntryType != RedirectEntry {
|
||
|
return 0, errors.New("Not a RedirectEntry")
|
||
|
}
|
||
|
// We use the cluster to save the redirect index position for RedirectEntry type
|
||
|
return a.cluster, nil
|
||
|
}
|
||
|
|
||
|
func (a *Article) blobOffsetsAtIdx(z *ZimReader) (start, end uint64) {
|
||
|
idx := a.blob
|
||
|
offset := z.clusterPtrPos + uint64(idx)*8
|
||
|
start, err := readInt64(z.bytesRangeAt(offset, offset+8))
|
||
|
if err != nil {
|
||
|
return
|
||
|
}
|
||
|
offset = z.clusterPtrPos + uint64(idx+1)*8
|
||
|
end, _ = readInt64(z.bytesRangeAt(offset, offset+8))
|
||
|
|
||
|
return
|
||
|
}
|