edge/pkg/bundle/zim/zim.go

package zim

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"

	lru "github.com/hashicorp/golang-lru/v2"
	"github.com/pkg/errors"
)

// zimHeader is the magic number readFileHeaders expects when it decodes the
// first four bytes of the file; any other value is rejected as "not a ZIM file".
const zimHeader = 0x044D495A // 72173914 in decimal

// ZimReader keeps track of everything related to reading one ZIM file: the
// open file handle plus the values decoded from the file header by
// readFileHeaders.
type ZimReader struct {
	// f is the file opened by NewReader; all reads go through it and Close closes it.
	f             *os.File
	// UUID is decoded with readInt32 from header bytes 8..16.
	// NOTE(review): the ZIM UUID is presumably wider than 32 bits — confirm the
	// truncation to uint32 is intended.
	UUID          uint32
	// ArticleCount is read from header offset 24; it bounds the URL and title
	// index iterations.
	ArticleCount  uint32
	// clusterCount is read from header offset 28.
	clusterCount  uint32
	// urlPtrPos is read from header offset 32; OffsetAtURLIdx indexes 8-byte
	// slots starting here.
	urlPtrPos     uint64
	// titlePtrPos is read from header offset 40; the title listers read 4-byte
	// slots starting here.
	titlePtrPos   uint64
	// clusterPtrPos is read from header offset 48; clusterOffsetsAtIdx indexes
	// 8-byte slots starting here.
	clusterPtrPos uint64
	// mimeListPos is read from header offset 56; MimeTypes parses the list
	// found at this position.
	mimeListPos   uint64
	// mainPage is read from header offset 64; NewReader presets it to
	// 0xffffffff before the header is parsed.
	mainPage      uint32
	// layoutPage is read from header offset 68; NewReader presets it to
	// 0xffffffff before the header is parsed.
	layoutPage    uint32
	// mimeTypeList caches the result of MimeTypes once it is non-empty.
	mimeTypeList  []string
}

// NewReader opens the ZIM file at path and parses its header.
//
// As a side effect it (re)initialises the package-level articlePool and
// bcache, so those are reset every time a reader is created.
//
// On failure the returned reader is nil and the file handle has already been
// closed, so the caller has nothing to clean up. On success the caller owns
// the reader and must call Close when done.
func NewReader(path string) (*ZimReader, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	z := &ZimReader{f: f, mainPage: 0xffffffff, layoutPage: 0xffffffff}

	articlePool = sync.Pool{
		New: func() any {
			return new(Article)
		},
	}
	// The cache holds up to 5 entries (presumably uncompressed blobs — confirm
	// against the bcache users). lru.New only reports an error for a
	// non-positive size, so ignoring it is safe with this constant.
	bcache, _ = lru.New[any, any](5)

	if err := z.readFileHeaders(); err != nil {
		// Do not leak the descriptor or hand back a half-initialised reader;
		// the header error is the one worth reporting, so the Close error is
		// deliberately dropped.
		_ = f.Close()
		return nil, err
	}
	return z, nil
}

// MimeTypes returns the ordered list of MIME types declared in the ZIM file.
//
// The list is parsed from the 2048 bytes starting at mimeListPos and cached on
// the reader once it is non-empty. Entries are NUL-terminated and an entry
// consisting of a single NUL marks the end of the list.
//
// If the 2048-byte window cannot be read, nil is returned and nothing is
// cached. If the window ends before the end-of-list marker is seen, the
// complete entries found so far are returned and the trailing unterminated
// fragment is dropped.
func (z *ZimReader) MimeTypes() []string {
	if len(z.mimeTypeList) != 0 {
		return z.mimeTypeList
	}

	// Assume the whole MIME list fits in 2k.
	b, err := z.bytesRangeAt(z.mimeListPos, z.mimeListPos+2048)
	if err != nil {
		return nil
	}
	bbuf := bytes.NewBuffer(b)

	var s []string
	for {
		line, err := bbuf.ReadBytes('\x00')
		if err == io.EOF {
			// The window is exhausted without an end-of-list marker. Stop
			// here: once the buffer is empty ReadBytes keeps returning an
			// empty line with io.EOF, which would otherwise append "" forever.
			break
		}
		if err != nil {
			return s
		}
		// A line of length 1 contains only \x00: the end-of-list marker.
		if len(line) == 1 {
			break
		}
		s = append(s, strings.TrimRight(string(line), "\x00"))
	}
	z.mimeTypeList = s
	return s
}

// ListArticles walks the URL index and streams each article on the returned
// channel, which is closed once the whole index has been visited.
//
// This is a slow implementation; a real iterator is faster. It is not meant
// for big ZIM files — use the indexes instead.
//
// The producer goroutine blocks on the channel (buffer of 10) and has no
// cancellation, so the caller has to drain the channel for it to finish.
// Entries whose lookup fails are skipped.
func (z *ZimReader) ListArticles() <-chan *Article {
	out := make(chan *Article, 10)

	go func() {
		defer close(out)

		// Index 0 is skipped on purpose to avoid the "con" entry.
		for i := uint32(1); i < z.ArticleCount; i++ {
			article, err := z.ArticleAtURLIdx(i)
			if err != nil {
				continue
			}
			// TODO: deal with redirects. A nil article returned without an
			// error is currently forwarded to the consumer unchanged.
			out <- article
		}
	}()

	return out
}

// ListTitlesPtr streams the title pointer list on the returned channel, in
// title order, and closes the channel when done.
//
// Each value is the uint32 decoded from one 4-byte slot starting at
// titlePtrPos; per the original notes these are positions in the URL index,
// which makes them cheap to store when building an index.
//
// This is a slow implementation; prefer ListTitlesPtrIterator to build an
// index over a big ZIM file.
//
// Exactly ArticleCount slots are visited. Iteration stops at the first slot
// that cannot be read, so a truncated file yields a short stream instead of a
// goroutine that spins forever over offsets past the end of the file.
func (z *ZimReader) ListTitlesPtr() <-chan uint32 {
	ch := make(chan uint32, 10)

	go func() {
		defer close(ch)

		total := uint64(z.ArticleCount)
		for i := uint64(0); i < total; i++ {
			pos := z.titlePtrPos + i*4
			idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
			if err != nil {
				// There is no way to report the error on this channel; stop
				// rather than reading past the title pointer list.
				return
			}
			ch <- idx
		}
	}()
	return ch
}

// ListTitlesPtrIterator calls cb once per entry of the title pointer list, in
// title order.
//
// Each value passed to cb is the uint32 decoded from one 4-byte slot starting
// at titlePtrPos; per the original notes these are positions in the URL index,
// which makes them cheap to store when building an index.
//
// Exactly ArticleCount slots are visited. Iteration stops at the first slot
// that cannot be read, so a truncated file ends the walk early instead of
// looping forever over offsets past the end of the file.
func (z *ZimReader) ListTitlesPtrIterator(cb func(uint32)) {
	total := uint64(z.ArticleCount)
	for i := uint64(0); i < total; i++ {
		pos := z.titlePtrPos + i*4
		idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
		if err != nil {
			// The signature offers no way to report the error; stop rather
			// than reading past the title pointer list.
			return
		}
		cb(idx)
	}
}

// GetPageNoIndex returns the article whose full URL is exactly url, found by
// binary search over the URL pointer list without any external index.
//
// The search assumes the URL pointer list is ordered by Article.FullURL
// (NOTE(review): confirm against the ZIM writer). Every index in
// [0, ArticleCount) is a candidate, including the first one.
//
// It returns ErrNotFound (with a stack trace attached) when no entry matches,
// and any error from reading or decoding an entry as-is.
func (z *ZimReader) GetPageNoIndex(url string) (*Article, error) {
	a := new(Article)

	// Half-open search interval [lo, hi): it shrinks on every iteration, so
	// the loop terminates for any ArticleCount, including 0 and 1.
	lo, hi := uint32(0), z.ArticleCount
	for lo < hi {
		// Written this way so the midpoint cannot overflow uint32.
		mid := lo + (hi-lo)/2

		offset, err := z.OffsetAtURLIdx(mid)
		if err != nil {
			return nil, err
		}
		if err := z.FillArticleAt(a, offset); err != nil {
			return nil, err
		}

		switch full := a.FullURL(); {
		case full == url:
			return a, nil
		case full < url:
			lo = mid + 1
		default:
			hi = mid
		}
	}
	return nil, errors.WithStack(ErrNotFound)
}

// OffsetAtURLIdx returns the value stored in slot idx of the URL pointer
// list: the 8 bytes at urlPtrPos + idx*8, decoded by readInt64. Callers use
// the result as the position of the corresponding article entry.
// Read and decode errors are returned unchanged.
func (z *ZimReader) OffsetAtURLIdx(idx uint32) (uint64, error) {
	const slotSize = 8 // width in bytes of one URL pointer slot

	slot := z.urlPtrPos + uint64(idx)*slotSize
	raw, err := z.bytesRangeAt(slot, slot+slotSize)
	return readInt64(raw, err)
}

// Close closes the underlying file handle opened by NewReader and returns the
// error, if any, reported by the file's Close method.
// The reader must not be used afterwards: every read goes through that handle.
func (z *ZimReader) Close() error {
	return z.f.Close()
}

// String renders a human-readable summary of the reader: file size, article
// count, the header positions (in hex) and the MIME type list.
// If the file cannot be stat'ed it returns "corrupted zim" instead.
// Note that it calls MimeTypes, which may read from the file.
func (z *ZimReader) String() string {
	info, err := z.f.Stat()
	if err != nil {
		return "corrupted zim"
	}

	const layout = "Size: %d, ArticleCount: %d urlPtrPos: 0x%x titlePtrPos: 0x%x mimeListPos: 0x%x clusterPtrPos: 0x%x\nMimeTypes: %v"
	return fmt.Sprintf(
		layout,
		info.Size(),
		z.ArticleCount,
		z.urlPtrPos,
		z.titlePtrPos,
		z.mimeListPos,
		z.clusterPtrPos,
		z.MimeTypes(),
	)
}

// bytesRangeAt returns a freshly allocated copy of the file bytes in the
// half-open range [start, end).
//
// It exists to abstract the storage access (e.g. mmap) away from callers, so
// they never read the backing storage directly.
//
// An error is returned when end < start, when the underlying read fails
// (including hitting end of file before end), or when fewer bytes than
// requested were read.
func (z *ZimReader) bytesRangeAt(start, end uint64) ([]byte, error) {
	// Offsets come straight from the file; an inverted range would underflow
	// the unsigned length below and make the allocation panic.
	if end < start {
		return nil, fmt.Errorf("invalid byte range [%d, %d)", start, end)
	}
	size := end - start

	buf := make([]byte, size)
	n, err := z.f.ReadAt(buf, int64(start))
	if err != nil {
		return nil, fmt.Errorf("can't read bytes  %w", err)
	}

	// ReadAt already reports an error on a short read; kept as a cheap guard.
	if uint64(n) != size {
		return nil, errors.New("can't read enough bytes")
	}

	return buf, nil
}

// readFileHeaders decodes the fixed-position header fields of the ZIM file
// into the reader and then primes the MIME type cache.
//
// The magic number at offset 0 must equal zimHeader, otherwise (or if it
// cannot be read) "not a ZIM file" is returned. The version at offset 4 is
// only checked for readability; its value is discarded. The remaining fields
// are read in file order and each one is stored only if its read succeeds, so
// a failure leaves the later fields at their previous values and returns the
// read error unchanged.
func (z *ZimReader) readFileHeaders() error {
	// File type check.
	magic, err := readInt32(z.bytesRangeAt(0, 4))
	if err != nil || magic != zimHeader {
		return errors.New("not a ZIM file")
	}

	// Version: must be readable, value unused.
	if _, err := readInt32(z.bytesRangeAt(4, 8)); err != nil {
		return errors.Wrap(err, "could not read file version")
	}

	// Remaining header fields, in the order they are read. Exactly one of
	// dst32/dst64 is set and selects the decoder.
	fields := []struct {
		start, end uint64
		dst32      *uint32
		dst64      *uint64
	}{
		// UUID: 8 bytes are fetched but decoded with readInt32.
		// NOTE(review): confirm the 32-bit truncation is intended.
		{start: 8, end: 16, dst32: &z.UUID},
		{start: 24, end: 28, dst32: &z.ArticleCount},
		{start: 28, end: 32, dst32: &z.clusterCount},
		{start: 32, end: 40, dst64: &z.urlPtrPos},
		{start: 40, end: 48, dst64: &z.titlePtrPos},
		{start: 48, end: 56, dst64: &z.clusterPtrPos},
		{start: 56, end: 64, dst64: &z.mimeListPos},
		{start: 64, end: 68, dst32: &z.mainPage},
		{start: 68, end: 72, dst32: &z.layoutPage},
	}
	for _, fld := range fields {
		if fld.dst32 != nil {
			val, err := readInt32(z.bytesRangeAt(fld.start, fld.end))
			if err != nil {
				return err
			}
			*fld.dst32 = val
			continue
		}

		val, err := readInt64(z.bytesRangeAt(fld.start, fld.end))
		if err != nil {
			return err
		}
		*fld.dst64 = val
	}

	// Called for its side effect of filling the mimeTypeList cache.
	z.MimeTypes()
	return nil
}

// clusterOffsetsAtIdx returns the start and end offsets of the cluster at
// index idx.
//
// start is the value of slot idx in the cluster pointer list and end is the
// value of slot idx+1 minus one, i.e. the last byte before the next cluster.
//
// An error is returned when idx is not below clusterCount or when either
// slot cannot be read; start and end are zero in that case.
//
// NOTE(review): for the last cluster, slot idx+1 lies just past the
// clusterCount entries of the pointer list — confirm what the file stores
// there.
func (z *ZimReader) clusterOffsetsAtIdx(idx uint32) (start, end uint64, err error) {
	if idx >= z.clusterCount {
		return 0, 0, fmt.Errorf("cluster index %d out of range (cluster count %d)", idx, z.clusterCount)
	}

	// Widen before doing any arithmetic so idx+1 cannot wrap around in uint32.
	slot := z.clusterPtrPos + uint64(idx)*8

	start, err = readInt64(z.bytesRangeAt(slot, slot+8))
	if err != nil {
		return 0, 0, err
	}

	next, err := readInt64(z.bytesRangeAt(slot+8, slot+16))
	if err != nil {
		return 0, 0, err
	}

	return start, next - 1, nil
}
feat: basic zim support 2023-07-11 02:42:05 +02:00			`package zim`

			`import (`
			`"bytes"`
			`"fmt"`
			`"io"`
			`"os"`
			`"strings"`
			`"sync"`

			`lru "github.com/hashicorp/golang-lru/v2"`
			`"github.com/pkg/errors"`
			`)`

			`const (`
			`zimHeader = 72173914`
			`)`

			`// ZimReader keep tracks of everything related to ZIM reading`
			`type ZimReader struct {`
			`f *os.File`
			`UUID uint32`
			`ArticleCount uint32`
			`clusterCount uint32`
			`urlPtrPos uint64`
			`titlePtrPos uint64`
			`clusterPtrPos uint64`
			`mimeListPos uint64`
			`mainPage uint32`
			`layoutPage uint32`
			`mimeTypeList []string`
			`}`

			`// create a new zim reader`
			`func NewReader(path string) (*ZimReader, error) {`
			`f, err := os.Open(path)`
			`if err != nil {`
			`return nil, err`
			`}`
			`z := ZimReader{f: f, mainPage: 0xffffffff, layoutPage: 0xffffffff}`

			`articlePool = sync.Pool{`
			`New: func() interface{} {`
			`return new(Article)`
			`},`
			`}`
			`// keep 4 latest uncompressed blobs, around 1M per blob`
			`bcache, _ = lru.New[any, any](5)`

			`err = z.readFileHeaders()`
			`return &z, err`
			`}`

			`// Return an ordered list of mime types present in the ZIM file`
			`func (z *ZimReader) MimeTypes() []string {`
			`if len(z.mimeTypeList) != 0 {`
			`return z.mimeTypeList`
			`}`

			`var s []string`
			`// assume mime list fit in 2k`
			`b, err := z.bytesRangeAt(z.mimeListPos, z.mimeListPos+2048)`
			`if err != nil {`
			`return s`
			`}`
			`bbuf := bytes.NewBuffer(b)`

			`for {`
			`line, err := bbuf.ReadBytes('\x00')`
			`if err != nil && err != io.EOF {`
			`return s`
			`}`
			`// a line of 1 is a line containing only \x00 and it's the marker for the`
			`// end of mime types list`
			`if len(line) == 1 {`
			`break`
			`}`
			`s = append(s, strings.TrimRight(string(line), "\x00"))`
			`}`
			`z.mimeTypeList = s`
			`return s`
			`}`

			`// list all articles, using url index, contained in a zim file`
			`// note that this is a slow implementation, a real iterator is faster`
			`// you are not suppose to use this method on big zim files, use indexes`
			`func (z ZimReader) ListArticles() <-chan Article {`
			`ch := make(chan *Article, 10)`

			`go func() {`
			`var idx uint32`
			`// starting at 1 to avoid "con" entry`
			`var start uint32 = 1`

			`for idx = start; idx < z.ArticleCount; idx++ {`
			`art, err := z.ArticleAtURLIdx(idx)`
			`if err != nil {`
			`continue`
			`}`

			`if art == nil {`
			`// TODO: deal with redirect continue`
			`}`
			`ch <- art`
			`}`
			`close(ch)`
			`}()`
			`return ch`
			`}`

			`// list all title pointer, Titles by position contained in a zim file`
			`// Titles are pointers to URLpos index, useful for indexing cause smaller to store: uint32`
			`// note that this is a slow implementation, a real iterator is faster`
			`// you are not suppose to use this method on big zim files prefer ListTitlesPtrIterator to build your index`
			`func (z *ZimReader) ListTitlesPtr() <-chan uint32 {`
			`ch := make(chan uint32, 10)`

			`go func() {`
			`var pos uint64`
			`var count uint32`

			`for pos = z.titlePtrPos; count < z.ArticleCount; pos += 4 {`
			`idx, err := readInt32(z.bytesRangeAt(pos, pos+4))`
			`if err != nil {`
			`continue`
			`}`
			`ch <- idx`
			`count++`
			`}`
			`close(ch)`
			`}()`
			`return ch`
			`}`

			`// list all title pointer, Titles by position contained in a zim file`
			`// Titles are pointers to URLpos index, usefull for indexing cause smaller to store: uint32`
			`func (z *ZimReader) ListTitlesPtrIterator(cb func(uint32)) {`
			`var count uint32`
			`for pos := z.titlePtrPos; count < z.ArticleCount; pos += 4 {`
			`idx, err := readInt32(z.bytesRangeAt(pos, pos+4))`
			`if err != nil {`
			`continue`
			`}`
			`cb(idx)`
			`count++`
			`}`
			`}`

			`// return the article at the exact url not using any index`
			`func (z ZimReader) GetPageNoIndex(url string) (Article, error) {`
			`// starting at 1 to avoid "con" entry`
			`var start uint32`
			`stop := z.ArticleCount`

			`a := new(Article)`

			`for {`
			`pos := (start + stop) / 2`

			`offset, err := z.OffsetAtURLIdx(pos)`
			`if err != nil {`
			`return nil, err`
			`}`
			`err = z.FillArticleAt(a, offset)`
			`if err != nil {`
			`return nil, err`
			`}`

			`if a.FullURL() == url {`
			`return a, nil`
			`}`

			`if a.FullURL() > url {`
			`stop = pos`
			`} else {`
			`start = pos`
			`}`
			`if stop-start == 1 {`
			`break`
			`}`

			`}`
			`return nil, errors.WithStack(ErrNotFound)`
			`}`

			`// get the offset pointing to Article at pos in the URL idx`
			`func (z *ZimReader) OffsetAtURLIdx(idx uint32) (uint64, error) {`
			`offset := z.urlPtrPos + uint64(idx)*8`
			`return readInt64(z.bytesRangeAt(offset, offset+8))`
			`}`

			`// Close & cleanup the zimreader`
			`func (z *ZimReader) Close() error {`
			`return z.f.Close()`
			`}`

			`func (z *ZimReader) String() string {`
			`fi, err := z.f.Stat()`
			`if err != nil {`
			`return "corrupted zim"`
			`}`
			`return fmt.Sprintf("Size: %d, ArticleCount: %d urlPtrPos: 0x%x titlePtrPos: 0x%x mimeListPos: 0x%x clusterPtrPos: 0x%x\nMimeTypes: %v",`
			`fi.Size(), z.ArticleCount, z.urlPtrPos, z.titlePtrPos, z.mimeListPos, z.clusterPtrPos, z.MimeTypes())`
			`}`

			`// getBytesRangeAt returns bytes from start to end`
			`// it's needed to abstract mmap usages rather than read directly on the mmap slices`
			`func (z *ZimReader) bytesRangeAt(start, end uint64) ([]byte, error) {`
			`buf := make([]byte, end-start)`
			`n, err := z.f.ReadAt(buf, int64(start))`
			`if err != nil {`
			`return nil, fmt.Errorf("can't read bytes %w", err)`
			`}`

			`if n != int(end-start) {`
			`return nil, errors.New("can't read enough bytes")`
			`}`

			`return buf, nil`
			`}`

			`// populate the ZimReader structs with headers`
			`func (z *ZimReader) readFileHeaders() error {`
			`// checking for file type`
			`v, err := readInt32(z.bytesRangeAt(0, 0+4))`
			`if err != nil \|\| v != zimHeader {`
			`return errors.New("not a ZIM file")`
			`}`

			`// checking for version`
			`v, err = readInt32(z.bytesRangeAt(4, 4+4))`
			`if err != nil {`
			`return errors.Wrap(err, "could not read file version")`
			`}`

			`// checking for articles count`
			`v, err = readInt32(z.bytesRangeAt(8, 16))`
			`if err != nil {`
			`return err`
			`}`
			`z.UUID = v`

			`// checking for articles count`
			`v, err = readInt32(z.bytesRangeAt(24, 24+4))`
			`if err != nil {`
			`return err`
			`}`
			`z.ArticleCount = v`

			`// checking for cluster count`
			`v, err = readInt32(z.bytesRangeAt(28, 28+4))`
			`if err != nil {`
			`return err`
			`}`
			`z.clusterCount = v`

			`// checking for urlPtrPos`
			`vb, err := readInt64(z.bytesRangeAt(32, 32+8))`
			`if err != nil {`
			`return err`
			`}`
			`z.urlPtrPos = vb`

			`// checking for titlePtrPos`
			`vb, err = readInt64(z.bytesRangeAt(40, 40+8))`
			`if err != nil {`
			`return err`
			`}`
			`z.titlePtrPos = vb`

			`// checking for clusterPtrPos`
			`vb, err = readInt64(z.bytesRangeAt(48, 48+8))`
			`if err != nil {`
			`return err`
			`}`
			`z.clusterPtrPos = vb`

			`// checking for mimeListPos`
			`vb, err = readInt64(z.bytesRangeAt(56, 56+8))`
			`if err != nil {`
			`return err`
			`}`
			`z.mimeListPos = vb`

			`// checking for mainPage`
			`v, err = readInt32(z.bytesRangeAt(64, 64+4))`
			`if err != nil {`
			`return err`
			`}`
			`z.mainPage = v`

			`// checking for layoutPage`
			`v, err = readInt32(z.bytesRangeAt(68, 68+4))`
			`if err != nil {`
			`return err`
			`}`
			`z.layoutPage = v`

			`z.MimeTypes()`
			`return nil`
			`}`

			`// return start and end offsets for cluster at index idx`
			`func (z *ZimReader) clusterOffsetsAtIdx(idx uint32) (start, end uint64, err error) {`
			`offset := z.clusterPtrPos + (uint64(idx) * 8)`
			`start, err = readInt64(z.bytesRangeAt(offset, offset+8))`
			`if err != nil {`
			`return`
			`}`
			`offset = z.clusterPtrPos + (uint64(idx+1) * 8)`
			`end, err = readInt64(z.bytesRangeAt(offset, offset+8))`
			`if err != nil {`
			`return`
			`}`
			`end--`
			`return`
			`}`