edge/pkg/bundle/zim/reader.go

523 lines
10 KiB
Go

package zim
import (
"encoding/binary"
"fmt"
"io"
"os"
"strings"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/pkg/errors"
)
const (
	// zimFormatMagicNumber is the value expected in the first four bytes of
	// the file when decoded as a little-endian uint32 (bytes 5A 49 4D 04,
	// i.e. "ZIM\x04").
	zimFormatMagicNumber uint32 = 0x044D495A

	// nullByte terminates the strings read by readStringAt.
	nullByte = '\x00'

	// zimRedirect is the MIME type index that marks an entry as a redirect
	// rather than a content entry.
	zimRedirect = 0xffff
)
// Reader gives access to the entries of a ZIM archive stored behind an
// io.ReadSeekCloser. The header, MIME type list and URL pointer index are
// loaded by parse; entries themselves are decoded on demand.
type Reader struct {
	// Header fields, filled in by parseHeader.
	majorVersion  uint16 // header offset 4
	minorVersion  uint16 // header offset 6
	uuid          string // header bytes 8..23, rendered as a hex string
	entryCount    uint32 // header offset 24
	clusterCount  uint32 // header offset 28
	urlPtrPos     uint64 // header offset 32; start of the URL pointer list
	titlePtrPos   uint64 // header offset 40
	clusterPtrPos uint64 // header offset 48
	mimeListPos   uint64 // header offset 56; start of the MIME type list
	mainPage      uint32 // header offset 64; presumably an entry index — confirm against the ZIM spec
	layoutPage    uint32 // header offset 68; presumably an entry index — confirm against the ZIM spec
	checksumPos   uint64 // header offset 72

	// mimeTypes holds the strings read from mimeListPos up to the first
	// empty string.
	mimeTypes []string
	// urlIndex holds entryCount entry offsets read from urlPtrPos.
	urlIndex []uint64

	// urlCache maps "<namespace>/<url>" keys to entry offsets.
	urlCache *lru.Cache[string, uint64]
	// titleCache maps entry titles to entry offsets.
	titleCache *lru.Cache[string, uint64]

	// seeker is the underlying archive data source.
	seeker io.ReadSeekCloser
}
// Version returns the major and minor format version read from the archive
// header.
func (r *Reader) Version() (majorVersion, minorVersion uint16) {
	majorVersion = r.majorVersion
	minorVersion = r.minorVersion
	return majorVersion, minorVersion
}
// EntryCount returns the number of entries declared in the archive header.
func (r *Reader) EntryCount() uint32 {
	count := r.entryCount
	return count
}
// ClusterCount returns the number of clusters declared in the archive header.
func (r *Reader) ClusterCount() uint32 {
	count := r.clusterCount
	return count
}
// UUID returns the archive identifier as formatted by parseUUID.
func (r *Reader) UUID() string {
	id := r.uuid
	return id
}
// Entries returns a new iterator bound to this reader.
func (r *Reader) Entries() *EntryIterator {
	it := &EntryIterator{reader: r}
	return it
}
// EntryAt parses and returns the entry stored at position idx of the URL
// index. The parsed entry is also recorded in the URL and title caches.
//
// It returns an error wrapping ErrInvalidEntryIndex when idx is negative or
// not smaller than the number of indexed entries.
func (r *Reader) EntryAt(idx int) (Entry, error) {
	if idx < 0 || idx >= len(r.urlIndex) {
		return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of bounds", idx)
	}

	offset := r.urlIndex[idx]

	parsed, err := r.parseEntryAt(int64(offset))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	r.cacheEntry(offset, parsed)

	return parsed, nil
}
// EntryWithURL returns the entry matching the given namespace and URL.
//
// The URL cache is consulted first; on a hit the entry is parsed directly
// from the cached offset. Otherwise the entry iterator is walked until an
// entry with the same namespace and URL is found. ErrNotFound is returned
// when the iterator finishes without a match and without an error.
func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) {
	if cachedOffset, ok := r.getEntryOffsetByURLFromCache(ns, url); ok {
		cached, err := r.parseEntryAt(int64(cachedOffset))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return cached, nil
	}

	it := r.Entries()
	for it.Next() {
		candidate := it.Entry()
		if candidate.Namespace() != ns || candidate.URL() != url {
			continue
		}

		return candidate, nil
	}

	if err := it.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}
// EntryWithTitle returns the first entry whose title equals title.
//
// The title cache is consulted first; on a hit the entry is parsed directly
// from the cached offset. Otherwise the entry iterator is walked until an
// entry with the same title is found. ErrNotFound is returned when the
// iterator finishes without a match and without an error.
func (r *Reader) EntryWithTitle(title string) (Entry, error) {
	if cachedOffset, ok := r.getEntryOffsetByTitleFromCache(title); ok {
		cached, err := r.parseEntryAt(int64(cachedOffset))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return cached, nil
	}

	it := r.Entries()
	for it.Next() {
		candidate := it.Entry()
		if candidate.Title() != title {
			continue
		}

		return candidate, nil
	}

	if err := it.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}
// getURLCacheKey builds the URL cache key for entry, in the form
// "<namespace>/<url>".
func (r *Reader) getURLCacheKey(entry Entry) string {
	cacheKey := fmt.Sprintf("%s/%s", entry.Namespace(), entry.URL())
	return cacheKey
}
// cacheEntry records offset in both lookup caches: under the entry's
// namespace/URL key in the URL cache and under its title in the title cache.
func (r *Reader) cacheEntry(offset uint64, entry Entry) {
	r.urlCache.Add(r.getURLCacheKey(entry), offset)
	r.titleCache.Add(entry.Title(), offset)
}
// getEntryOffsetByURLFromCache looks up the entry offset stored in the URL
// cache under "<namespace>/<url>". The boolean reports whether the key was
// present.
func (r *Reader) getEntryOffsetByURLFromCache(namespace Namespace, url string) (uint64, bool) {
	cacheKey := fmt.Sprintf("%s/%s", namespace, url)
	offset, ok := r.urlCache.Get(cacheKey)

	return offset, ok
}
// getEntryOffsetByTitleFromCache looks up the entry offset stored in the
// title cache under title. The boolean reports whether the key was present.
func (r *Reader) getEntryOffsetByTitleFromCache(title string) (uint64, bool) {
	offset, ok := r.titleCache.Get(title)

	return offset, ok
}
// parse loads the archive metadata: the header, then the MIME type list,
// then the URL index. It stops at the first step that fails.
func (r *Reader) parse() error {
	steps := []func() error{
		r.parseHeader,
		r.parseMimeTypes,
		r.parseURLIndex,
	}

	for _, step := range steps {
		if err := step(); err != nil {
			return errors.WithStack(err)
		}
	}

	return nil
}
// parseHeader reads the fixed-offset header fields into the Reader.
//
// The magic number at offset 0 is validated first; a mismatch yields an
// error. The remaining fields are then read in ascending offset order, and
// each one is stored only after it has been read successfully. The first
// failing read aborts parsing.
func (r *Reader) parseHeader() error {
	magic, err := r.readUint32At(0)
	if err != nil {
		return errors.WithStack(err)
	}

	if magic != zimFormatMagicNumber {
		return errors.Errorf("invalid zim magic number '%d'", magic)
	}

	// Format version: two 16-bit fields directly after the magic number.
	for _, field := range []struct {
		offset int64
		dst    *uint16
	}{
		{offset: 4, dst: &r.majorVersion},
		{offset: 6, dst: &r.minorVersion},
	} {
		value, err := r.readUint16At(field.offset)
		if err != nil {
			return errors.WithStack(err)
		}

		*field.dst = value
	}

	// The UUID occupies bytes 8..23.
	if err := r.parseUUID(); err != nil {
		return errors.WithStack(err)
	}

	// Remaining fields; exactly one of u32/u64 is set per row and selects
	// the width of the read.
	for _, field := range []struct {
		offset int64
		u32    *uint32
		u64    *uint64
	}{
		{offset: 24, u32: &r.entryCount},
		{offset: 28, u32: &r.clusterCount},
		{offset: 32, u64: &r.urlPtrPos},
		{offset: 40, u64: &r.titlePtrPos},
		{offset: 48, u64: &r.clusterPtrPos},
		{offset: 56, u64: &r.mimeListPos},
		{offset: 64, u32: &r.mainPage},
		{offset: 68, u32: &r.layoutPage},
		{offset: 72, u64: &r.checksumPos},
	} {
		if field.u32 != nil {
			value, err := r.readUint32At(field.offset)
			if err != nil {
				return errors.WithStack(err)
			}

			*field.u32 = value

			continue
		}

		value, err := r.readUint64At(field.offset)
		if err != nil {
			return errors.WithStack(err)
		}

		*field.u64 = value
	}

	return nil
}
// parseUUID reads the 16 UUID bytes stored at header offset 8 and stores
// them in r.uuid as a lowercase hexadecimal string in the canonical
// 8-4-4-4-12 grouping.
//
// Every byte is rendered as exactly two hex digits, so the result is always
// 36 characters long, including when a group starts with zero bytes.
func (r *Reader) parseUUID() error {
	data := make([]byte, 16)
	if err := r.readRange(8, data); err != nil {
		return errors.WithStack(err)
	}

	// %x applied to a byte slice emits two digits per byte, which keeps the
	// leading zeros that integer formatting without a width would drop.
	r.uuid = fmt.Sprintf(
		"%x-%x-%x-%x-%x",
		data[0:4], data[4:6], data[6:8], data[8:10], data[10:16],
	)

	return nil
}
// parseMimeTypes reads consecutive null-terminated strings starting at
// mimeListPos and stores them in r.mimeTypes. The list ends at the first
// empty string.
func (r *Reader) parseMimeTypes() error {
	types := []string{}
	pos := int64(r.mimeListPos)

	for {
		name, length, err := r.readStringAt(pos)
		if err != nil {
			return errors.WithStack(err)
		}

		if name == "" {
			r.mimeTypes = types

			return nil
		}

		types = append(types, name)

		// Skip the string itself plus its terminating null byte.
		pos += length + 1
	}
}
// parseURLIndex loads the entry pointer list located at urlPtrPos into
// r.urlIndex.
func (r *Reader) parseURLIndex() error {
	pointers, err := r.parseEntryIndex(int64(r.urlPtrPos))
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlIndex = pointers

	return nil
}
// parseEntryAt decodes the entry located at offset. The common entry fields
// are parsed first; a MIME type index equal to zimRedirect selects the
// redirect parser, any other value the content parser.
func (r *Reader) parseEntryAt(offset int64) (Entry, error) {
	base, err := r.parseBaseEntry(offset)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	if base.mimeTypeIndex == zimRedirect {
		redirect, err := r.parseRedirectEntry(offset, base)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return redirect, nil
	}

	content, err := r.parseContentEntry(offset, base)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	return content, nil
}
// parseEntryIndex reads entryCount consecutive 8-byte little-endian pointers
// starting at startAddr and returns them in file order.
func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) {
	pointers := make([]uint64, r.entryCount)
	buf := make([]byte, 8)

	for i := range pointers {
		position := startAddr + int64(i)*8
		if err := r.readRange(position, buf); err != nil {
			return nil, errors.WithStack(err)
		}

		value, err := readUint64(buf, binary.LittleEndian)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		pointers[i] = value
	}

	return pointers, nil
}
// readRange fills v with the len(v) bytes located at offset, measured from
// the start of the underlying source.
//
// io.ReadFull is used because a single Read call is allowed to return fewer
// bytes than requested without reporting an error; reading continues until v
// is full or the source fails. An error is returned when the seek fails,
// when the read fails, or when the source ends before v is filled.
func (r *Reader) readRange(offset int64, v []byte) error {
	if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil {
		return errors.WithStack(err)
	}

	if _, err := io.ReadFull(r.seeker, v); err != nil {
		// io.ReadFull reports a source that ended part-way through the
		// buffer with this exact sentinel value.
		if err == io.ErrUnexpectedEOF {
			return errors.Wrap(err, "could not read enough bytes")
		}

		return errors.WithStack(err)
	}

	return nil
}
// readUint32At reads 4 bytes at offset and decodes them as a little-endian
// uint32. It returns 0 together with the error when reading or decoding
// fails.
func (r *Reader) readUint32At(offset int64) (uint32, error) {
	var buf [4]byte

	if err := r.readRange(offset, buf[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	decoded, err := readUint32(buf[:], binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint16At reads 2 bytes at offset and decodes them as a little-endian
// uint16. It returns 0 together with the error when reading or decoding
// fails.
func (r *Reader) readUint16At(offset int64) (uint16, error) {
	var buf [2]byte

	if err := r.readRange(offset, buf[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	decoded, err := readUint16(buf[:], binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint64At reads 8 bytes at offset and decodes them as a little-endian
// uint64. It returns 0 together with the error when reading or decoding
// fails.
func (r *Reader) readUint64At(offset int64) (uint64, error) {
	var buf [8]byte

	if err := r.readRange(offset, buf[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	decoded, err := readUint64(buf[:], binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readStringAt reads a null-terminated string starting at offset, one byte
// per readRange call.
//
// It returns the string without its terminator and the number of bytes that
// precede the terminator. On a read error it returns an empty string, the
// number of bytes consumed so far, and the error.
func (r *Reader) readStringAt(offset int64) (string, int64, error) {
	var (
		text strings.Builder
		buf  [1]byte
	)

	for length := int64(0); ; length++ {
		if err := r.readRange(offset+length, buf[:]); err != nil {
			return "", length, errors.WithStack(err)
		}

		if buf[0] == nullByte {
			return text.String(), length, nil
		}

		// strings.Builder.WriteByte is documented to always return nil.
		_ = text.WriteByte(buf[0])
	}
}
// Close closes the underlying data source and returns its error, if any,
// with a stack trace attached.
func (r *Reader) Close() error {
	err := r.seeker.Close()
	if err != nil {
		return errors.WithStack(err)
	}

	return nil
}
// NewReader creates a Reader on top of seeker and parses the archive
// metadata (header, MIME type list and URL index).
//
// The URL and title caches are sized from the options built from funcs. An
// error is returned when either cache cannot be created or when parsing
// fails; seeker is not closed in that case.
func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) {
	opts := NewOptions(funcs...)

	byURL, err := lru.New[string, uint64](opts.URLCacheSize)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	byTitle, err := lru.New[string, uint64](opts.TitleCacheSize)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	zimReader := &Reader{
		seeker:     seeker,
		urlCache:   byURL,
		titleCache: byTitle,
	}

	if err := zimReader.parse(); err != nil {
		return nil, errors.WithStack(err)
	}

	return zimReader, nil
}
// Open opens the file at path read-only and returns a Reader for it.
//
// The returned Reader owns the file; call Close on it to release the file.
// When the Reader cannot be created, the file is closed before the error is
// returned so the descriptor is not leaked.
func Open(path string, funcs ...OptionFunc) (*Reader, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader, err := NewReader(file, funcs...)
	if err != nil {
		// NewReader leaves the source open on failure and nothing else holds
		// a reference to file. The close error is deliberately dropped in
		// favor of the more useful construction error.
		_ = file.Close()

		return nil, errors.WithStack(err)
	}

	return reader, nil
}