523 lines
10 KiB
Go
523 lines
10 KiB
Go
|
package zim
|
||
|
|
||
|
import (
|
||
|
"encoding/binary"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"os"
|
||
|
"strings"
|
||
|
|
||
|
lru "github.com/hashicorp/golang-lru/v2"
|
||
|
"github.com/pkg/errors"
|
||
|
)
|
||
|
|
||
|
const (
	// zimFormatMagicNumber is the value expected in the first four bytes
	// of the file (read as a little-endian uint32 by parseHeader).
	zimFormatMagicNumber uint32 = 0x44D495A

	// nullByte terminates strings read by readStringAt.
	nullByte = '\x00'

	// zimRedirect is the mime type index value that parseEntryAt treats
	// as marking a redirect entry.
	zimRedirect = 0xffff
)
|
||
|
|
||
|
type Reader struct {
|
||
|
majorVersion uint16
|
||
|
minorVersion uint16
|
||
|
uuid string
|
||
|
entryCount uint32
|
||
|
clusterCount uint32
|
||
|
urlPtrPos uint64
|
||
|
titlePtrPos uint64
|
||
|
clusterPtrPos uint64
|
||
|
mimeListPos uint64
|
||
|
mainPage uint32
|
||
|
layoutPage uint32
|
||
|
checksumPos uint64
|
||
|
|
||
|
mimeTypes []string
|
||
|
|
||
|
urlIndex []uint64
|
||
|
|
||
|
urlCache *lru.Cache[string, uint64]
|
||
|
titleCache *lru.Cache[string, uint64]
|
||
|
|
||
|
seeker io.ReadSeekCloser
|
||
|
}
|
||
|
|
||
|
func (r *Reader) Version() (majorVersion, minorVersion uint16) {
|
||
|
return r.majorVersion, r.minorVersion
|
||
|
}
|
||
|
|
||
|
func (r *Reader) EntryCount() uint32 {
|
||
|
return r.entryCount
|
||
|
}
|
||
|
|
||
|
func (r *Reader) ClusterCount() uint32 {
|
||
|
return r.clusterCount
|
||
|
}
|
||
|
|
||
|
func (r *Reader) UUID() string {
|
||
|
return r.uuid
|
||
|
}
|
||
|
|
||
|
func (r *Reader) Entries() *EntryIterator {
|
||
|
return &EntryIterator{
|
||
|
reader: r,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (r *Reader) EntryAt(idx int) (Entry, error) {
|
||
|
if idx >= len(r.urlIndex) || idx < 0 {
|
||
|
return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of bounds", idx)
|
||
|
}
|
||
|
|
||
|
entryPtr := r.urlIndex[idx]
|
||
|
|
||
|
entry, err := r.parseEntryAt(int64(entryPtr))
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.cacheEntry(entryPtr, entry)
|
||
|
|
||
|
return entry, nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) {
|
||
|
offset, found := r.getEntryOffsetByURLFromCache(ns, url)
|
||
|
if found {
|
||
|
entry, err := r.parseEntryAt(int64(offset))
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return entry, nil
|
||
|
}
|
||
|
|
||
|
iterator := r.Entries()
|
||
|
|
||
|
for iterator.Next() {
|
||
|
entry := iterator.Entry()
|
||
|
|
||
|
if entry.Namespace() == ns && entry.URL() == url {
|
||
|
return entry, nil
|
||
|
}
|
||
|
}
|
||
|
if err := iterator.Err(); err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return nil, errors.WithStack(ErrNotFound)
|
||
|
}
|
||
|
|
||
|
func (r *Reader) EntryWithTitle(title string) (Entry, error) {
|
||
|
offset, found := r.getEntryOffsetByTitleFromCache(title)
|
||
|
if found {
|
||
|
entry, err := r.parseEntryAt(int64(offset))
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return entry, nil
|
||
|
}
|
||
|
|
||
|
iterator := r.Entries()
|
||
|
|
||
|
for iterator.Next() {
|
||
|
entry := iterator.Entry()
|
||
|
|
||
|
if entry.Title() == title {
|
||
|
return entry, nil
|
||
|
}
|
||
|
}
|
||
|
if err := iterator.Err(); err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return nil, errors.WithStack(ErrNotFound)
|
||
|
}
|
||
|
|
||
|
func (r *Reader) getURLCacheKey(entry Entry) string {
|
||
|
return fmt.Sprintf("%s/%s", entry.Namespace(), entry.URL())
|
||
|
}
|
||
|
|
||
|
func (r *Reader) cacheEntry(offset uint64, entry Entry) {
|
||
|
urlKey := r.getURLCacheKey(entry)
|
||
|
r.urlCache.Add(urlKey, offset)
|
||
|
r.titleCache.Add(entry.Title(), offset)
|
||
|
}
|
||
|
|
||
|
func (r *Reader) getEntryOffsetByURLFromCache(namespace Namespace, url string) (uint64, bool) {
|
||
|
key := fmt.Sprintf("%s/%s", namespace, url)
|
||
|
return r.urlCache.Get(key)
|
||
|
}
|
||
|
|
||
|
func (r *Reader) getEntryOffsetByTitleFromCache(title string) (uint64, bool) {
|
||
|
return r.titleCache.Get(title)
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parse() error {
|
||
|
if err := r.parseHeader(); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if err := r.parseMimeTypes(); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if err := r.parseURLIndex(); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parseHeader() error {
|
||
|
magicNumber, err := r.readUint32At(0)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if magicNumber != zimFormatMagicNumber {
|
||
|
return errors.Errorf("invalid zim magic number '%d'", magicNumber)
|
||
|
}
|
||
|
|
||
|
majorVersion, err := r.readUint16At(4)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.majorVersion = majorVersion
|
||
|
|
||
|
minorVersion, err := r.readUint16At(6)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.minorVersion = minorVersion
|
||
|
|
||
|
if err := r.parseUUID(); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
entryCount, err := r.readUint32At(24)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.entryCount = entryCount
|
||
|
|
||
|
clusterCount, err := r.readUint32At(28)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.clusterCount = clusterCount
|
||
|
|
||
|
urlPtrPos, err := r.readUint64At(32)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.urlPtrPos = urlPtrPos
|
||
|
|
||
|
titlePtrPos, err := r.readUint64At(40)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.titlePtrPos = titlePtrPos
|
||
|
|
||
|
clusterPtrPos, err := r.readUint64At(48)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.clusterPtrPos = clusterPtrPos
|
||
|
|
||
|
mimeListPos, err := r.readUint64At(56)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.mimeListPos = mimeListPos
|
||
|
|
||
|
mainPage, err := r.readUint32At(64)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.mainPage = mainPage
|
||
|
|
||
|
layoutPage, err := r.readUint32At(68)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.layoutPage = layoutPage
|
||
|
|
||
|
checksumPos, err := r.readUint64At(72)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.checksumPos = checksumPos
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parseUUID() error {
|
||
|
data := make([]byte, 16)
|
||
|
if err := r.readRange(8, data); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
parts := make([]string, 0, 5)
|
||
|
|
||
|
val32, err := readUint32(data[0:4], binary.BigEndian)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
parts = append(parts, fmt.Sprintf("%08x", val32))
|
||
|
|
||
|
val16, err := readUint16(data[4:6], binary.BigEndian)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
parts = append(parts, fmt.Sprintf("%04x", val16))
|
||
|
|
||
|
val16, err = readUint16(data[6:8], binary.BigEndian)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
parts = append(parts, fmt.Sprintf("%04x", val16))
|
||
|
|
||
|
val16, err = readUint16(data[8:10], binary.BigEndian)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
parts = append(parts, fmt.Sprintf("%04x", val16))
|
||
|
|
||
|
val32, err = readUint32(data[10:14], binary.BigEndian)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
val16, err = readUint16(data[14:16], binary.BigEndian)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
parts = append(parts, fmt.Sprintf("%x%x", val32, val16))
|
||
|
|
||
|
r.uuid = strings.Join(parts, "-")
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parseMimeTypes() error {
|
||
|
mimeTypes := make([]string, 0)
|
||
|
|
||
|
offset := int64(r.mimeListPos)
|
||
|
for {
|
||
|
mimeType, read, err := r.readStringAt(offset)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if mimeType == "" {
|
||
|
break
|
||
|
}
|
||
|
|
||
|
mimeTypes = append(mimeTypes, mimeType)
|
||
|
|
||
|
offset += read + 1
|
||
|
}
|
||
|
|
||
|
r.mimeTypes = mimeTypes
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parseURLIndex() error {
|
||
|
urlIndex, err := r.parseEntryIndex(int64(r.urlPtrPos))
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
r.urlIndex = urlIndex
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parseEntryAt(offset int64) (Entry, error) {
|
||
|
base, err := r.parseBaseEntry(offset)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
var entry Entry
|
||
|
|
||
|
if base.mimeTypeIndex == zimRedirect {
|
||
|
entry, err = r.parseRedirectEntry(offset, base)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
} else {
|
||
|
entry, err = r.parseContentEntry(offset, base)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return entry, nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) {
|
||
|
index := make([]uint64, r.entryCount)
|
||
|
|
||
|
data := make([]byte, 8)
|
||
|
for i := int64(0); i < int64(r.entryCount); i++ {
|
||
|
if err := r.readRange(startAddr+i*8, data); err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
ptr, err := readUint64(data, binary.LittleEndian)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
index[i] = ptr
|
||
|
}
|
||
|
|
||
|
return index, nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) readRange(offset int64, v []byte) error {
|
||
|
if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
read, err := r.seeker.Read(v)
|
||
|
if err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if read != len(v) {
|
||
|
return errors.New("could not read enough bytes")
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) readUint32At(offset int64) (uint32, error) {
|
||
|
data := make([]byte, 4)
|
||
|
if err := r.readRange(offset, data); err != nil {
|
||
|
return 0, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
value, err := readUint32(data, binary.LittleEndian)
|
||
|
if err != nil {
|
||
|
return 0, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return value, nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) readUint16At(offset int64) (uint16, error) {
|
||
|
data := make([]byte, 2)
|
||
|
if err := r.readRange(offset, data); err != nil {
|
||
|
return 0, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
value, err := readUint16(data, binary.LittleEndian)
|
||
|
if err != nil {
|
||
|
return 0, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return value, nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) readUint64At(offset int64) (uint64, error) {
|
||
|
data := make([]byte, 8)
|
||
|
if err := r.readRange(offset, data); err != nil {
|
||
|
return 0, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
value, err := readUint64(data, binary.LittleEndian)
|
||
|
if err != nil {
|
||
|
return 0, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return value, nil
|
||
|
}
|
||
|
|
||
|
func (r *Reader) readStringAt(offset int64) (string, int64, error) {
|
||
|
data := make([]byte, 1)
|
||
|
var sb strings.Builder
|
||
|
read := int64(0)
|
||
|
for {
|
||
|
if err := r.readRange(offset+read, data); err != nil {
|
||
|
return "", read, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if err := sb.WriteByte(data[0]); err != nil {
|
||
|
return "", read, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
if data[0] == nullByte {
|
||
|
str := strings.TrimRight(sb.String(), "\x00")
|
||
|
return str, read, nil
|
||
|
}
|
||
|
|
||
|
read++
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (r *Reader) Close() error {
|
||
|
if err := r.seeker.Close(); err != nil {
|
||
|
return errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) {
|
||
|
opts := NewOptions(funcs...)
|
||
|
|
||
|
urlCache, err := lru.New[string, uint64](opts.URLCacheSize)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
titleCache, err := lru.New[string, uint64](opts.TitleCacheSize)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
reader := &Reader{
|
||
|
seeker: seeker,
|
||
|
urlCache: urlCache,
|
||
|
titleCache: titleCache,
|
||
|
}
|
||
|
|
||
|
if err := reader.parse(); err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return reader, nil
|
||
|
}
|
||
|
|
||
|
func Open(path string, funcs ...OptionFunc) (*Reader, error) {
|
||
|
file, err := os.OpenFile(path, os.O_RDONLY, os.ModePerm)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
reader, err := NewReader(file, funcs...)
|
||
|
if err != nil {
|
||
|
return nil, errors.WithStack(err)
|
||
|
}
|
||
|
|
||
|
return reader, nil
|
||
|
}
|