wip: zim lib rewrite
arcad/edge/pipeline/head There was a failure building this commit Details

This commit is contained in:
wpetit 2023-10-11 11:18:32 +02:00
parent 8facff2bd2
commit 7a703f30cc
20 changed files with 1295 additions and 152 deletions

2
go.mod
View File

@ -1,6 +1,6 @@
module forge.cadoles.com/arcad/edge module forge.cadoles.com/arcad/edge
go 1.20 go 1.21
require ( require (
github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/hashicorp/golang-lru/v2 v2.0.7

View File

@ -0,0 +1,7 @@
package zim
import "errors"
var (
	// ErrNotFound is a sentinel error carrying the message "not found".
	ErrNotFound = errors.New("not found")
)

View File

@ -8,6 +8,7 @@ import (
"strings" "strings"
"sync" "sync"
"github.com/davecgh/go-spew/spew"
lru "github.com/hashicorp/golang-lru/v2" lru "github.com/hashicorp/golang-lru/v2"
"github.com/pkg/errors" "github.com/pkg/errors"
) )
@ -100,6 +101,7 @@ func (z *ZimReader) ListArticles() <-chan *Article {
if art == nil { if art == nil {
// TODO: deal with redirect continue // TODO: deal with redirect continue
continue
} }
ch <- art ch <- art
} }
@ -296,6 +298,8 @@ func (z *ZimReader) readFileHeaders() error {
} }
z.layoutPage = v z.layoutPage = v
spew.Dump(z)
z.MimeTypes() z.MimeTypes()
return nil return nil
} }

View File

@ -0,0 +1,153 @@
package zim
import (
"path/filepath"
"reflect"
"runtime"
"testing"
"github.com/pkg/errors"
)
// testCases lists the checks that TestZim runs, in order, against every
// opened ZIM archive. Each function receives the sub-test handle and the
// shared reader.
var testCases = []func(t *testing.T, z *ZimReader){
	testOpen,
	testData,
	testDisplayArticle,
	testDisplayInfost,
	testFavicon,
	testListArticles,
	testMainPage,
	testMetadata,
	testMime,
	testURLAtIdx,
}
// TestZim opens every archive matching testdata/*.zim and runs all the
// functions registered in testCases against it. Sub-tests are nested as
// <archive file name>/<Go function name of the case>.
func TestZim(t *testing.T) {
	archives, err := filepath.Glob("testdata/*.zim")
	if err != nil {
		t.Fatalf("%+v", errors.WithStack(err))
	}

	for _, archive := range archives {
		zr, err := NewReader(archive)
		if err != nil {
			t.Fatalf("%+v", errors.WithStack(err))
		}

		runCases := func(t *testing.T) {
			for _, testCase := range testCases {
				// The sub-test is named after the function implementing the case.
				caseName := runtime.FuncForPC(reflect.ValueOf(testCase).Pointer()).Name()

				t.Run(caseName, func(t *testing.T) {
					testCase(t, zr)
				})
			}
		}

		t.Run(filepath.Base(archive), runCases)
	}
}
// testOpen fails when the reader reports an ArticleCount of zero.
func testOpen(t *testing.T, zr *ZimReader) {
	if zr.ArticleCount == 0 {
		t.Errorf("No article found")
	}
}
// testMime fails when the reader returns an empty mime type list.
func testMime(t *testing.T, zr *ZimReader) {
	if len(zr.MimeTypes()) == 0 {
		t.Errorf("No mime types found")
	}
}
// testDisplayInfost checks that the reader's textual description is not
// empty and logs it.
func testDisplayInfost(t *testing.T, zr *ZimReader) {
	info := zr.String()

	// A length can never be negative: the emptiness check is the one that
	// can actually detect a missing description.
	if len(info) == 0 {
		t.Errorf("Can't read infos")
	}

	t.Log(info)
}
// testURLAtIdx resolves URL index 5 to an offset, loads the article stored at
// that offset and fails when no article is returned.
// NOTE(review): the second return value of both calls is discarded — confirm
// whether they are errors worth reporting.
func testURLAtIdx(t *testing.T, zr *ZimReader) {
	// addr 0 is a redirect
	p, _ := zr.OffsetAtURLIdx(5)
	a, _ := zr.ArticleAt(p)
	if a == nil {
		t.Errorf("Can't find 1st url")
	}
}
// testDisplayArticle loads the article referenced by URL index 5, fails when
// no article is returned and logs whatever was obtained.
// NOTE(review): the second return value of both calls is discarded — confirm
// whether they are errors worth reporting.
func testDisplayArticle(t *testing.T, zr *ZimReader) {
	// addr 0 is a redirect
	p, _ := zr.OffsetAtURLIdx(5)
	a, _ := zr.ArticleAt(p)
	if a == nil {
		t.Errorf("Can't find 1st url")
	}
	t.Log(a)
}
// testListArticles drains the channel returned by ListArticles, logging every
// article, and compares the number of received articles with ArticleCount-1.
// It is skipped in short mode.
func testListArticles(t *testing.T, zr *ZimReader) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var listed uint32
	for article := range zr.ListArticles() {
		listed++
		t.Log(article.String())
	}

	if listed == 0 {
		t.Errorf("Can't find any urls")
	}

	// NOTE(review): the expected total is one less than ArticleCount —
	// presumably one entry is never listed; confirm against ListArticles.
	if expected := zr.ArticleCount - 1; listed != expected {
		t.Errorf("Can't find the exact ArticleCount urls %d vs %d", listed, zr.ArticleCount)
	}
}
// testMainPage fails when MainPage returns no article, then logs the result.
// The second return value of MainPage is discarded.
func testMainPage(t *testing.T, zr *ZimReader) {
	a, _ := zr.MainPage()
	if a == nil {
		t.Errorf("Can't find the mainpage article")
	}
	t.Log(a)
}
// testFavicon fails when Favicon returns an error or a nil result.
func testFavicon(t *testing.T, zr *ZimReader) {
	favicon, err := zr.Favicon()
	if err != nil {
		t.Errorf("%+v", errors.WithStack(err))
	}
	if favicon == nil {
		t.Errorf("Can't find the favicon article")
	}
}
// testMetadata fails when Metadata returns an error or a nil result.
func testMetadata(t *testing.T, zr *ZimReader) {
	metadata, err := zr.Metadata()
	if err != nil {
		t.Errorf("%+v", errors.WithStack(err))
	}
	if metadata == nil {
		t.Errorf("Can't find the metadata")
	}
}
// testData loads the article referenced by URL index 2 and, unless it is a
// redirect, checks that its data is not empty. The article description and
// its data are logged.
func testData(t *testing.T, zr *ZimReader) {
	// addr 0 is a redirect
	p, _ := zr.OffsetAtURLIdx(2)
	a, _ := zr.ArticleAt(p)

	// Stop here instead of dereferencing a nil article below, which would
	// panic and abort the whole test binary.
	if a == nil {
		t.Fatalf("Can't find article at url index 2")
	}

	b, _ := a.Data()
	data := string(b)

	if a.EntryType != RedirectEntry {
		if len(data) == 0 {
			t.Error("can't read data")
		}
	}

	t.Log(a.String())
	t.Log(data)
}

View File

@ -0,0 +1,233 @@
package zim
import (
"bytes"
"encoding/binary"
"io"
"log"
"github.com/pkg/errors"
)
// zimCompression identifies the compression scheme of a cluster, as stored
// in the low four bits of the cluster header byte.
type zimCompression uint8

// Known compression identifiers.
// NOTE(review): every name carries a "None" infix, including the ones that
// designate an actual algorithm (ZLib, BZip2, XZ, ZStandard) — the infix looks
// like a copy/paste leftover; consider renaming together with the users.
const (
	zimCompressionNoneZeno      zimCompression = 0
	zimCompressionNone          zimCompression = 1
	zimCompressionNoneZLib      zimCompression = 2
	zimCompressionNoneBZip2     zimCompression = 3
	zimCompressionNoneXZ        zimCompression = 4
	zimCompressionNoneZStandard zimCompression = 5
)
// ContentEntry is a directory entry that refers to actual content, located
// by a cluster index and a blob index inside that cluster.
type ContentEntry struct {
	*BaseEntry
	// mimeType is the mime type resolved from the archive's mime type list.
	mimeType string
	// clusterIndex is the position of the cluster in the cluster pointer list.
	clusterIndex uint32
	// blobIndex is the blob index read from the directory entry.
	blobIndex uint32
}
// Reader returns a reader over the content of the entry.
//
// NOTE: work in progress. The cluster holding the entry is located and its
// header byte decoded (and logged), but blob extraction is not implemented
// yet: the returned reader is always empty. The former implementation is kept
// below, commented out, as a reference.
//
// An error is returned when the cluster pointers or the cluster header cannot
// be read.
func (e *ContentEntry) Reader() (io.Reader, error) {
	data := make([]byte, 8)

	// The cluster pointer list stores one 8-byte little-endian offset per
	// cluster. The index is widened before the arithmetic so that it cannot
	// wrap around in 32 bits.
	startClusterPtrOffset := e.reader.clusterPtrPos + (uint64(e.clusterIndex) * 8)

	if err := e.reader.readRange(int64(startClusterPtrOffset), data); err != nil {
		return nil, errors.WithStack(err)
	}

	startClusterOffset, err := readUint64(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// The cluster is assumed to end where the next one starts.
	// TODO: confirm how the end of the very last cluster must be computed, as
	// there is no following pointer in that case.
	endClusterPtrOffset := e.reader.clusterPtrPos + ((uint64(e.clusterIndex) + 1) * 8)

	if err := e.reader.readRange(int64(endClusterPtrOffset), data); err != nil {
		return nil, errors.WithStack(err)
	}

	endClusterOffset, err := readUint64(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// The header is the first byte of the cluster itself, so it must be read
	// at the offset the pointer refers to, not at the pointer's own position.
	data = make([]byte, 1)
	if err := e.reader.readRange(int64(startClusterOffset), data); err != nil {
		return nil, errors.WithStack(err)
	}

	clusterHeader := uint8(data[0])

	// Low four bits: compression identifier. Fifth bit: "extended" flag.
	compression := clusterHeader & 0x0f
	extended := clusterHeader&0x10 != 0

	log.Printf("%08b %v %04b %d %d %d", clusterHeader, extended, compression, compression, startClusterOffset, endClusterOffset)

	switch compression {
	case uint8(zimCompressionNoneZeno):
		fallthrough
	case uint8(zimCompressionNone):
	case uint8(zimCompressionNoneXZ):
	case uint8(zimCompressionNoneZStandard):
	case uint8(zimCompressionNoneZLib):
		fallthrough
	case uint8(zimCompressionNoneBZip2):
		fallthrough
	default:
		// return nil, errors.Wrapf(ErrCompressionAlgorithmNotSupported, "unexpected compression algorithm '%d'", compression)
	}

	var internal []byte
	buff := bytes.NewBuffer(internal)

	// blob starts at offset, blob ends at offset
	// var bs, be uint32
	// // LZMA: 4, Zstandard: 5
	// if compression == 4 || compression == 5 {
	// var blob []byte
	// var ok bool
	// var dec io.ReadCloser
	// if blob, ok = blobLookup(); !ok {
	// b, err := a.z.bytesRangeAt(start+1, end+1)
	// if err != nil {
	// return nil, err
	// }
	// bbuf := bytes.NewBuffer(b)
	// switch compression {
	// case 5:
	// dec, err = NewZstdReader(bbuf)
	// case 4:
	// dec, err = NewXZReader(bbuf)
	// }
	// if err != nil {
	// return nil, err
	// }
	// defer dec.Close()
	// // the decoded chunk are around 1MB
	// b, err = ioutil.ReadAll(dec)
	// if err != nil {
	// return nil, err
	// }
	// blob = make([]byte, len(b))
	// copy(blob, b)
	// // TODO: 2 requests for the same blob could occure at the same time
	// bcache.Add(a.cluster, blob)
	// } else {
	// bi, ok := bcache.Get(a.cluster)
	// if !ok {
	// return nil, errors.New("not in cache anymore")
	// }
	// blob = bi.([]byte)
	// }
	// bs, err = readInt32(blob[a.blob*4:a.blob*4+4], nil)
	// if err != nil {
	// return nil, err
	// }
	// be, err = readInt32(blob[a.blob*4+4:a.blob*4+4+4], nil)
	// if err != nil {
	// return nil, err
	// }
	// // avoid retaining all the chunk
	// c := make([]byte, be-bs)
	// copy(c, blob[bs:be])
	// return c, nil
	// } else if compression == 0 || compression == 1 {
	// // uncompresssed
	// startPos := start + 1
	// blobOffset := uint64(a.blob * 4)
	// bs, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset, startPos+blobOffset+4))
	// if err != nil {
	// return nil, err
	// }
	// be, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset+4, startPos+blobOffset+4+4))
	// if err != nil {
	// return nil, err
	// }
	// return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be))
	// }

	return buff, nil
}
// Redirect returns the entry itself: a content entry is already the final
// target, so there is nothing to follow. The returned error is always nil.
func (e *ContentEntry) Redirect() (*ContentEntry, error) {
	return e, nil
}
// parseContentEntry decodes the content directory entry stored at offset and
// attaches it to base.
//
// Fields are read relative to offset: mime type index (2 bytes at +0),
// namespace (1 byte at +3), cluster index (4 bytes at +8), blob index
// (4 bytes at +12), then the zero-terminated URL starting at +16 immediately
// followed by the zero-terminated title.
//
// An error is returned when a read fails or when the mime type index does not
// designate an entry of the archive's mime type list.
func (r *Reader) parseContentEntry(offset int64, base *BaseEntry) (*ContentEntry, error) {
	entry := &ContentEntry{
		BaseEntry: base,
	}

	data := make([]byte, 2)
	if err := r.readRange(offset, data); err != nil {
		return nil, errors.WithStack(err)
	}

	mimeTypeIndex, err := readUint16(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// Compare as int: narrowing the list length to uint16 would wrap around
	// for lists holding more than 65535 items.
	if int(mimeTypeIndex) >= len(r.mimeTypes) {
		return nil, errors.Errorf("mime type index '%d' greater than mime types length '%d'", mimeTypeIndex, len(r.mimeTypes))
	}

	entry.mimeType = r.mimeTypes[mimeTypeIndex]

	data = make([]byte, 1)
	if err := r.readRange(offset+3, data); err != nil {
		return nil, errors.WithStack(err)
	}

	entry.namespace = Namespace(data[0])

	data = make([]byte, 4)
	if err := r.readRange(offset+8, data); err != nil {
		return nil, errors.WithStack(err)
	}

	clusterIndex, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.clusterIndex = clusterIndex

	if err := r.readRange(offset+12, data); err != nil {
		return nil, errors.WithStack(err)
	}

	blobIndex, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.blobIndex = blobIndex

	url, read, err := r.readStringAt(offset + 16)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.url = url

	// readStringAt reports the string length without its terminator: the
	// title starts one byte further, right after the URL's zero byte.
	// Without the extra byte the terminator itself is read and the title is
	// always empty.
	title, _, err := r.readStringAt(offset + 16 + read + 1)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.title = title

	return entry, nil
}

150
pkg/bundle/zim/entry.go Normal file
View File

@ -0,0 +1,150 @@
package zim
import (
"encoding/binary"
"github.com/pkg/errors"
)
// Namespace identifies the namespace a directory entry belongs to. It is
// built from the single namespace byte stored in the entry.
type Namespace string

// Namespace identifiers prefixed with V6.
// NOTE(review): presumably the namespaces of the newer archive layout —
// confirm against the ZIM specification.
const (
	V6NamespaceContent   = "C"
	V6NamespaceMetadata  = "M"
	V6NamespaceWellKnown = "W"
	V6NamespaceSearch    = "X"
)

// Namespace identifiers prefixed with V5.
// NOTE(review): presumably the namespaces of the older archive layout —
// confirm against the ZIM specification.
const (
	V5NamespaceLayout              = "-"
	V5NamespaceArticle             = "A"
	V5NamespaceArticleMetadata     = "B"
	V5NamespaceImageFile           = "I"
	V5NamespaceImageText           = "J"
	V5NamespaceMetadata            = "M"
	V5NamespaceCategoryText        = "U"
	V5NamespaceCategoryArticleList = "V"
	V5NamespaceCategoryPerArticle  = "W"
	V5NamespaceSearch              = "X"
)
// Entry is the behaviour shared by the directory entries of an archive.
type Entry interface {
	// Redirect returns the content entry this entry resolves to.
	Redirect() (*ContentEntry, error)
	// Namespace returns the namespace of the entry.
	Namespace() Namespace
	// URL returns the URL of the entry.
	URL() string
	// Title returns the title of the entry.
	Title() string
}
// BaseEntry gathers the attributes common to every kind of directory entry,
// along with the reader the entry was decoded from.
type BaseEntry struct {
	mimeTypeIndex uint16
	namespace     Namespace
	url           string
	title         string
	reader        *Reader
}

// Namespace returns the namespace of the entry.
func (e *BaseEntry) Namespace() Namespace {
	return e.namespace
}

// Title returns the title of the entry, or its URL when no title is set.
func (e *BaseEntry) Title() string {
	if e.title != "" {
		return e.title
	}

	return e.url
}

// URL returns the URL of the entry.
func (e *BaseEntry) URL() string {
	return e.url
}
// parseBaseEntry decodes the attributes shared by every directory entry from
// the entry stored at offset: the little-endian mime type index (2 bytes at
// +0) and the namespace byte (at +3). The URL and title are left empty.
func (r *Reader) parseBaseEntry(offset int64) (*BaseEntry, error) {
	rawMimeTypeIndex := make([]byte, 2)
	if err := r.readRange(offset, rawMimeTypeIndex); err != nil {
		return nil, errors.WithStack(err)
	}

	mimeTypeIndex, err := readUint16(rawMimeTypeIndex, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	rawNamespace := make([]byte, 1)
	if err := r.readRange(offset+3, rawNamespace); err != nil {
		return nil, errors.WithStack(err)
	}

	return &BaseEntry{
		mimeTypeIndex: mimeTypeIndex,
		namespace:     Namespace(rawNamespace[0]),
		reader:        r,
	}, nil
}
// RedirectEntry is a directory entry pointing to another entry.
type RedirectEntry struct {
	*BaseEntry
	// redirectIndex is the position of the target in the reader's URL index.
	redirectIndex uint32
}
// Redirect follows the redirect chain starting at this entry and returns the
// content entry it finally resolves to.
//
// It returns an error wrapping ErrInvalidEntryIndex when a redirect points
// outside of the URL index, and an error wrapping ErrInvalidRedirect when the
// chain never reaches a content entry (e.g. redirects forming a cycle).
func (e *RedirectEntry) Redirect() (*ContentEntry, error) {
	current := e

	// Archives are untrusted input: the chain is walked iteratively and
	// bounded so that redirects forming a cycle cannot recurse endlessly.
	// A chain without cycle visits each entry at most once, hence the bound.
	maxHops := len(e.reader.urlIndex)

	for hop := 0; hop <= maxHops; hop++ {
		urlIndex := current.reader.urlIndex

		if uint64(current.redirectIndex) >= uint64(len(urlIndex)) {
			return nil, errors.Wrapf(ErrInvalidEntryIndex, "entry index '%d' out of bounds", current.redirectIndex)
		}

		entryPtr := urlIndex[current.redirectIndex]

		entry, err := current.reader.parseEntryAt(int64(entryPtr))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		switch target := entry.(type) {
		case *ContentEntry:
			return target, nil
		case *RedirectEntry:
			current = target
		default:
			return nil, errors.WithStack(ErrInvalidRedirect)
		}
	}

	return nil, errors.Wrapf(ErrInvalidRedirect, "redirect chain starting at entry index '%d' never reaches a content entry", e.redirectIndex)
}
// parseRedirectEntry decodes the redirect directory entry stored at offset
// and attaches it to base.
//
// Fields are read relative to offset: the little-endian redirect index
// (4 bytes at +8), then the zero-terminated URL starting at +12 immediately
// followed by the zero-terminated title.
func (r *Reader) parseRedirectEntry(offset int64, base *BaseEntry) (*RedirectEntry, error) {
	entry := &RedirectEntry{
		BaseEntry: base,
	}

	data := make([]byte, 4)
	if err := r.readRange(offset+8, data); err != nil {
		return nil, errors.WithStack(err)
	}

	redirectIndex, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.redirectIndex = redirectIndex

	url, read, err := r.readStringAt(offset + 12)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.url = url

	// readStringAt reports the string length without its terminator: the
	// title starts one byte further, right after the URL's zero byte.
	// Without the extra byte the terminator itself is read and the title is
	// always empty.
	title, _, err := r.readStringAt(offset + 12 + read + 1)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.title = title

	return entry, nil
}

View File

@ -0,0 +1,46 @@
package zim
import "github.com/pkg/errors"
// EntryIterator walks the entries of a Reader in URL index order.
//
// Typical usage:
//
//	it := reader.Entries()
//	for it.Next() {
//		entry := it.Entry()
//		// ...
//	}
//	if err := it.Err(); err != nil {
//		// ...
//	}
type EntryIterator struct {
	index  int
	entry  Entry
	err    error
	reader *Reader
}

// Next loads the next entry and reports whether one is available. It returns
// false once every entry has been visited or after a failure; Err tells both
// cases apart.
func (it *EntryIterator) Next() bool {
	if it.err != nil {
		return false
	}

	// Every index in [0, EntryCount) designates an entry. Comparing against
	// the count itself (rather than count-1) keeps the last entry reachable
	// and avoids the unsigned underflow that occurs with an empty archive.
	if int64(it.index) >= int64(it.reader.EntryCount()) {
		return false
	}

	entry, err := it.reader.EntryAt(it.index)
	if err != nil {
		it.err = errors.WithStack(err)

		return false
	}

	it.entry = entry
	it.index++

	return true
}

// Err returns the error that stopped the iteration, if any.
func (it *EntryIterator) Err() error {
	return it.err
}

// Index returns the number of entries loaded so far, which is also the index
// of the entry the next call to Next will load.
func (it *EntryIterator) Index() int {
	return it.index
}

// Entry returns the entry loaded by the last successful call to Next.
func (it *EntryIterator) Entry() Entry {
	return it.entry
}

View File

@ -2,4 +2,9 @@ package zim
import "errors" import "errors"
var ErrNotFound = errors.New("not found") var (
ErrInvalidEntryIndex = errors.New("invalid entry index")
ErrNotFound = errors.New("not found")
ErrInvalidRedirect = errors.New("invalid redirect")
ErrCompressionAlgorithmNotSupported = errors.New("compression algorithm not supported")
)

38
pkg/bundle/zim/option.go Normal file
View File

@ -0,0 +1,38 @@
package zim
import "time"
// Options holds the tunable settings of a Reader.
type Options struct {
	URLCacheSize   int
	URLCacheTTL    time.Duration
	TitleCacheSize int
	TitleCacheTTL  time.Duration
}

// OptionFunc mutates an Options value.
type OptionFunc func(opts *Options)

// NewOptions builds an Options value: the defaults (both cache sizes set to
// 64) are applied first, then funcs in the given order, so that a caller
// supplied option always wins over a default.
func NewOptions(funcs ...OptionFunc) *Options {
	opts := &Options{}

	defaults := []OptionFunc{
		WithURLCacheSize(64),
		WithTitleCacheSize(64),
	}

	for _, apply := range defaults {
		apply(opts)
	}

	for _, apply := range funcs {
		apply(opts)
	}

	return opts
}

// WithURLCacheSize sets the URL cache size.
func WithURLCacheSize(size int) OptionFunc {
	return func(opts *Options) {
		opts.URLCacheSize = size
	}
}

// WithTitleCacheSize sets the title cache size.
func WithTitleCacheSize(size int) OptionFunc {
	return func(opts *Options) {
		opts.TitleCacheSize = size
	}
}

522
pkg/bundle/zim/reader.go Normal file
View File

@ -0,0 +1,522 @@
package zim
import (
"encoding/binary"
"fmt"
"io"
"os"
"strings"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/pkg/errors"
)
// zimFormatMagicNumber is the value expected in the first four bytes of an
// archive, read as a little-endian uint32.
const zimFormatMagicNumber uint32 = 0x44D495A

// nullByte terminates the strings stored in an archive.
const nullByte = '\x00'

// zimRedirect is the mime type index value marking an entry as a redirect.
const zimRedirect = 0xffff
// Reader gives access to the entries of a ZIM archive read from a seekable
// source. The header fields are filled once by parse.
// NOTE(review): every read seeks the shared source first, so concurrent use
// looks unsafe — confirm before sharing a Reader between goroutines.
type Reader struct {
	// Format version, read at header offsets 4 and 6.
	majorVersion uint16
	minorVersion uint16
	// uuid is the textual form of the 16 bytes read at header offset 8.
	uuid string
	// Counts read at header offsets 24 and 28.
	entryCount   uint32
	clusterCount uint32
	// Positions read at header offsets 32, 40, 48 and 56.
	urlPtrPos     uint64
	titlePtrPos   uint64
	clusterPtrPos uint64
	mimeListPos   uint64
	// Values read at header offsets 64 and 68.
	mainPage   uint32
	layoutPage uint32
	// checksumPos is read at header offset 72.
	checksumPos uint64

	// mimeTypes is the list of zero-terminated strings found at mimeListPos.
	mimeTypes []string
	// urlIndex holds one entry offset per entry, read from urlPtrPos.
	urlIndex []uint64

	// Caches mapping a "namespace/url" key or a title to an entry offset.
	urlCache   *lru.Cache[string, uint64]
	titleCache *lru.Cache[string, uint64]

	// seeker is the underlying source, closed by Close.
	seeker io.ReadSeekCloser
}
// Version returns the major and minor format version read from the header.
func (r *Reader) Version() (majorVersion, minorVersion uint16) {
	return r.majorVersion, r.minorVersion
}

// EntryCount returns the number of entries announced by the header.
func (r *Reader) EntryCount() uint32 {
	return r.entryCount
}

// ClusterCount returns the number of clusters announced by the header.
func (r *Reader) ClusterCount() uint32 {
	return r.clusterCount
}

// UUID returns the textual form of the archive's UUID.
func (r *Reader) UUID() string {
	return r.uuid
}

// Entries returns a new iterator positioned before the first entry.
func (r *Reader) Entries() *EntryIterator {
	return &EntryIterator{
		reader: r,
	}
}
// EntryAt returns the entry stored at position idx of the URL index and
// records its offset in the URL and title caches. An error wrapping
// ErrInvalidEntryIndex is returned when idx is out of bounds.
func (r *Reader) EntryAt(idx int) (Entry, error) {
	if idx < 0 || idx >= len(r.urlIndex) {
		return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of bounds", idx)
	}

	ptr := r.urlIndex[idx]

	parsed, err := r.parseEntryAt(int64(ptr))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	r.cacheEntry(ptr, parsed)

	return parsed, nil
}
// EntryWithURL returns the entry matching the given namespace and URL. The
// URL cache is consulted first; on a miss every entry is scanned in order.
// An error wrapping ErrNotFound is returned when no entry matches.
func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) {
	if cachedOffset, ok := r.getEntryOffsetByURLFromCache(ns, url); ok {
		cached, err := r.parseEntryAt(int64(cachedOffset))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return cached, nil
	}

	it := r.Entries()
	for it.Next() {
		candidate := it.Entry()
		if candidate.Namespace() != ns || candidate.URL() != url {
			continue
		}

		return candidate, nil
	}

	if err := it.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}
// EntryWithTitle returns the first entry whose title equals title. The title
// cache is consulted first; on a miss every entry is scanned in order. An
// error wrapping ErrNotFound is returned when no entry matches.
func (r *Reader) EntryWithTitle(title string) (Entry, error) {
	if cachedOffset, ok := r.getEntryOffsetByTitleFromCache(title); ok {
		cached, err := r.parseEntryAt(int64(cachedOffset))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return cached, nil
	}

	it := r.Entries()
	for it.Next() {
		candidate := it.Entry()
		if candidate.Title() != title {
			continue
		}

		return candidate, nil
	}

	if err := it.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}
// getURLCacheKey builds the URL cache key of an entry: "<namespace>/<url>".
func (r *Reader) getURLCacheKey(entry Entry) string {
	return fmt.Sprintf("%s/%s", entry.Namespace(), entry.URL())
}

// cacheEntry records the offset of an entry in both caches, keyed by its
// "<namespace>/<url>" pair and by its title.
func (r *Reader) cacheEntry(offset uint64, entry Entry) {
	urlKey := r.getURLCacheKey(entry)
	r.urlCache.Add(urlKey, offset)
	r.titleCache.Add(entry.Title(), offset)
}

// getEntryOffsetByURLFromCache looks up the offset cached for the given
// namespace and URL. The key format must stay in sync with getURLCacheKey.
func (r *Reader) getEntryOffsetByURLFromCache(namespace Namespace, url string) (uint64, bool) {
	key := fmt.Sprintf("%s/%s", namespace, url)
	return r.urlCache.Get(key)
}

// getEntryOffsetByTitleFromCache looks up the offset cached for a title.
func (r *Reader) getEntryOffsetByTitleFromCache(title string) (uint64, bool) {
	return r.titleCache.Get(title)
}
// parse loads the archive's structure: the header first, then the mime type
// list, then the URL index. It stops at the first failing step.
func (r *Reader) parse() error {
	steps := []func() error{
		r.parseHeader,
		r.parseMimeTypes,
		r.parseURLIndex,
	}

	for _, step := range steps {
		if err := step(); err != nil {
			return errors.WithStack(err)
		}
	}

	return nil
}
// parseHeader reads the fixed-position header fields (all little-endian,
// except the UUID handled by parseUUID) and stores them on the reader. It
// fails when a read fails or when the magic number is not the expected one.
func (r *Reader) parseHeader() error {
	// Offset 0: magic number.
	magicNumber, err := r.readUint32At(0)
	if err != nil {
		return errors.WithStack(err)
	}

	if magicNumber != zimFormatMagicNumber {
		return errors.Errorf("invalid zim magic number '%d'", magicNumber)
	}

	// Offsets 4 and 6: major and minor version.
	majorVersion, err := r.readUint16At(4)
	if err != nil {
		return errors.WithStack(err)
	}

	r.majorVersion = majorVersion

	minorVersion, err := r.readUint16At(6)
	if err != nil {
		return errors.WithStack(err)
	}

	r.minorVersion = minorVersion

	// Offset 8: 16 bytes of UUID.
	if err := r.parseUUID(); err != nil {
		return errors.WithStack(err)
	}

	// Offsets 24 and 28: entry and cluster counts.
	entryCount, err := r.readUint32At(24)
	if err != nil {
		return errors.WithStack(err)
	}

	r.entryCount = entryCount

	clusterCount, err := r.readUint32At(28)
	if err != nil {
		return errors.WithStack(err)
	}

	r.clusterCount = clusterCount

	// Offsets 32 to 56: positions of the URL pointer list, title pointer
	// list, cluster pointer list and mime type list.
	urlPtrPos, err := r.readUint64At(32)
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlPtrPos = urlPtrPos

	titlePtrPos, err := r.readUint64At(40)
	if err != nil {
		return errors.WithStack(err)
	}

	r.titlePtrPos = titlePtrPos

	clusterPtrPos, err := r.readUint64At(48)
	if err != nil {
		return errors.WithStack(err)
	}

	r.clusterPtrPos = clusterPtrPos

	mimeListPos, err := r.readUint64At(56)
	if err != nil {
		return errors.WithStack(err)
	}

	r.mimeListPos = mimeListPos

	// Offsets 64 and 68: main page and layout page.
	mainPage, err := r.readUint32At(64)
	if err != nil {
		return errors.WithStack(err)
	}

	r.mainPage = mainPage

	layoutPage, err := r.readUint32At(68)
	if err != nil {
		return errors.WithStack(err)
	}

	r.layoutPage = layoutPage

	// Offset 72: position of the checksum.
	checksumPos, err := r.readUint64At(72)
	if err != nil {
		return errors.WithStack(err)
	}

	r.checksumPos = checksumPos

	return nil
}
// parseUUID reads the 16 UUID bytes stored at header offset 8 and stores
// their canonical 8-4-4-4-12 lowercase hexadecimal form on the reader.
func (r *Reader) parseUUID() error {
	data := make([]byte, 16)
	if err := r.readRange(8, data); err != nil {
		return errors.WithStack(err)
	}

	// %x applied to a byte slice always prints two digits per byte, so every
	// group keeps its leading zeros. Formatting the last group as two
	// unpadded integers used to drop them and yield a malformed UUID.
	r.uuid = fmt.Sprintf("%x-%x-%x-%x-%x", data[0:4], data[4:6], data[6:8], data[8:10], data[10:16])

	return nil
}
// parseMimeTypes reads the list of zero-terminated mime types starting at
// mimeListPos. The list ends at the first empty string.
func (r *Reader) parseMimeTypes() error {
	found := make([]string, 0)

	for cursor := int64(r.mimeListPos); ; {
		mimeType, length, err := r.readStringAt(cursor)
		if err != nil {
			return errors.WithStack(err)
		}

		if mimeType == "" {
			break
		}

		found = append(found, mimeType)

		// Skip the string and its terminator.
		cursor += length + 1
	}

	r.mimeTypes = found

	return nil
}
// parseURLIndex loads the entry offsets listed at urlPtrPos into urlIndex.
func (r *Reader) parseURLIndex() error {
	offsets, err := r.parseEntryIndex(int64(r.urlPtrPos))
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlIndex = offsets

	return nil
}
// parseEntryAt decodes the directory entry stored at offset. An entry whose
// mime type index equals zimRedirect is decoded as a redirect entry, any
// other one as a content entry.
func (r *Reader) parseEntryAt(offset int64) (Entry, error) {
	base, err := r.parseBaseEntry(offset)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	if base.mimeTypeIndex == zimRedirect {
		redirect, err := r.parseRedirectEntry(offset, base)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return redirect, nil
	}

	content, err := r.parseContentEntry(offset, base)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	return content, nil
}
// parseEntryIndex reads entryCount consecutive 8-byte little-endian offsets
// starting at startAddr and returns them in order.
func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) {
	offsets := make([]uint64, r.entryCount)
	raw := make([]byte, 8)

	for position := range offsets {
		if err := r.readRange(startAddr+int64(position)*8, raw); err != nil {
			return nil, errors.WithStack(err)
		}

		value, err := readUint64(raw, binary.LittleEndian)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		offsets[position] = value
	}

	return offsets, nil
}
// readRange fills v with the len(v) bytes stored at the given absolute
// offset of the underlying source. It fails when the source cannot be
// positioned or does not hold enough bytes.
//
// The source is repositioned on every call, so calls must not overlap.
func (r *Reader) readRange(offset int64, v []byte) error {
	if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil {
		return errors.WithStack(err)
	}

	// A single Read is allowed to return fewer bytes than requested without
	// reporting an error; io.ReadFull keeps reading until v is filled.
	if _, err := io.ReadFull(r.seeker, v); err != nil {
		return errors.Wrapf(err, "could not read %d bytes at offset %d", len(v), offset)
	}

	return nil
}
// readUint32At decodes the little-endian uint32 stored at offset.
func (r *Reader) readUint32At(offset int64) (uint32, error) {
	var raw [4]byte

	if err := r.readRange(offset, raw[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	decoded, err := readUint32(raw[:], binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint16At decodes the little-endian uint16 stored at offset.
func (r *Reader) readUint16At(offset int64) (uint16, error) {
	var raw [2]byte

	if err := r.readRange(offset, raw[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	decoded, err := readUint16(raw[:], binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint64At decodes the little-endian uint64 stored at offset.
func (r *Reader) readUint64At(offset int64) (uint64, error) {
	var raw [8]byte

	if err := r.readRange(offset, raw[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	decoded, err := readUint64(raw[:], binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readStringAt reads the zero-terminated string starting at offset.
//
// It returns the string without its terminator, along with the number of
// bytes preceding the terminator: the byte following the string therefore
// sits at offset + count + 1. On failure the count reflects how many bytes
// were consumed before the error.
func (r *Reader) readStringAt(offset int64) (string, int64, error) {
	var sb strings.Builder

	current := make([]byte, 1)

	for read := int64(0); ; read++ {
		if err := r.readRange(offset+read, current); err != nil {
			return "", read, errors.WithStack(err)
		}

		// The terminator ends the string and is not part of it.
		if current[0] == nullByte {
			return sb.String(), read, nil
		}

		if err := sb.WriteByte(current[0]); err != nil {
			return "", read, errors.WithStack(err)
		}
	}
}
// Close closes the underlying source.
func (r *Reader) Close() error {
	// WithStack returns nil when given a nil error.
	return errors.WithStack(r.seeker.Close())
}
// NewReader creates a Reader on top of seeker, configured by funcs, and
// parses the archive's header, mime type list and URL index right away.
// The seeker is not closed when the creation fails.
func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) {
	opts := NewOptions(funcs...)

	var err error

	reader := &Reader{seeker: seeker}

	if reader.urlCache, err = lru.New[string, uint64](opts.URLCacheSize); err != nil {
		return nil, errors.WithStack(err)
	}

	if reader.titleCache, err = lru.New[string, uint64](opts.TitleCacheSize); err != nil {
		return nil, errors.WithStack(err)
	}

	if err = reader.parse(); err != nil {
		return nil, errors.WithStack(err)
	}

	return reader, nil
}
// Open opens the archive stored at path for reading and returns a Reader on
// it, configured by funcs. The returned Reader owns the file: closing the
// Reader closes the file.
func Open(path string, funcs ...OptionFunc) (*Reader, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader, err := NewReader(file, funcs...)
	if err != nil {
		// No Reader is handed to the caller, so nobody else can release the
		// file. Closing is best effort: the parse error is the one to report.
		_ = file.Close()

		return nil, errors.WithStack(err)
	}

	return reader, nil
}

View File

@ -0,0 +1,83 @@
package zim
import (
"log"
"path/filepath"
"testing"
"github.com/davecgh/go-spew/spew"
"github.com/pkg/errors"
)
// TestReader opens every archive matching testdata/*.zim, walks all of its
// entries, resolves each one to its content entry and requests a reader on
// its content. The walk of an archive stops at the first failure.
func TestReader(t *testing.T) {
	archives, err := filepath.Glob("testdata/*.zim")
	if err != nil {
		t.Fatalf("%+v", errors.WithStack(err))
	}

	for _, archive := range archives {
		t.Run(filepath.Base(archive), func(t *testing.T) {
			reader, err := Open(archive)
			if err != nil {
				t.Fatalf("%+v", errors.WithStack(err))
			}

			defer func() {
				if err := reader.Close(); err != nil {
					t.Fatalf("%+v", errors.WithStack(err))
				}
			}()

			it := reader.Entries()
			for it.Next() {
				content, err := it.Entry().Redirect()
				if err != nil {
					t.Errorf("%+v", errors.WithStack(err))

					break
				}

				log.Printf("%s/%s: %s", content.Namespace(), content.URL(), content.Title())

				contentReader, err := content.Reader()
				if err != nil {
					t.Errorf("%+v", errors.WithStack(err))

					break
				}

				spew.Dump(contentReader)
			}

			if err := it.Err(); err != nil {
				t.Errorf("%+v", errors.WithStack(err))
			}
		})

		// entry, err := reader.EntryWithURL(V6NamespaceContent, "A/a.tile.openstreetmap.org/16/33682/22970.png")
		// if err != nil {
		// t.Fatalf("%+v", errors.WithStack(err))
		// }
		// content, err := entry.Redirect()
		// if err != nil {
		// t.Fatalf("%+v", errors.WithStack(err))
		// }
		// contentReader, err := content.Reader()
		// if err != nil {
		// t.Fatalf("%+v", errors.WithStack(err))
		// break
		// }
		// data, err := io.ReadAll(contentReader)
		// if err != nil {
		// t.Fatalf("%+v", errors.WithStack(err))
		// break
		// }
		// spew.Dump(data)
	}
}

BIN
pkg/bundle/zim/testdata/cadoles.zim vendored Normal file

Binary file not shown.

52
pkg/bundle/zim/util.go Normal file
View File

@ -0,0 +1,52 @@
package zim
import (
"bytes"
"encoding/binary"
"github.com/pkg/errors"
)
// readUint64 decodes the first 8 bytes of b as a uint64 using the given byte
// order. It fails when b holds fewer than 8 bytes.
func readUint64(b []byte, order binary.ByteOrder) (uint64, error) {
	var decoded uint64

	if err := binary.Read(bytes.NewReader(b), order, &decoded); err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint32 decodes the first 4 bytes of b as a uint32 using the given byte
// order. It fails when b holds fewer than 4 bytes.
func readUint32(b []byte, order binary.ByteOrder) (uint32, error) {
	var decoded uint32

	if err := binary.Read(bytes.NewReader(b), order, &decoded); err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint16 decodes the first 2 bytes of b as a uint16 using the given byte
// order. It fails when b holds fewer than 2 bytes.
func readUint16(b []byte, order binary.ByteOrder) (uint16, error) {
	var decoded uint16

	if err := binary.Read(bytes.NewReader(b), order, &decoded); err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}
// readUint8 decodes the first byte of b as a uint8. The byte order is
// accepted for symmetry with the other helpers and has no effect on a single
// byte. It fails when b is empty.
func readUint8(b []byte, order binary.ByteOrder) (uint8, error) {
	var decoded uint8

	if err := binary.Read(bytes.NewReader(b), order, &decoded); err != nil {
		return 0, errors.WithStack(err)
	}

	return decoded, nil
}

View File

@ -1,150 +0,0 @@
package zim
import (
"log"
"testing"
"github.com/pkg/errors"
)
var Z *ZimReader
func init() {
var err error
Z, err = NewReader("testdata/wikibooks_af_all_maxi_2023-06.zim")
if err != nil {
log.Panicf("Can't read %v", err)
}
}
func TestOpen(t *testing.T) {
if Z.ArticleCount == 0 {
t.Errorf("No article found")
}
}
func TestMime(t *testing.T) {
if len(Z.MimeTypes()) == 0 {
t.Errorf("No mime types found")
}
}
func TestDisplayInfost(t *testing.T) {
info := Z.String()
if len(info) < 0 {
t.Errorf("Can't read infos")
}
t.Log(info)
}
func TestURLAtIdx(t *testing.T) {
// addr 0 is a redirect
p, _ := Z.OffsetAtURLIdx(5)
a, _ := Z.ArticleAt(p)
if a == nil {
t.Errorf("Can't find 1st url")
}
}
func TestDisplayArticle(t *testing.T) {
// addr 0 is a redirect
p, _ := Z.OffsetAtURLIdx(5)
a, _ := Z.ArticleAt(p)
if a == nil {
t.Errorf("Can't find 1st url")
}
t.Log(a)
}
func TestPageNoIndex(t *testing.T) {
a, _ := Z.GetPageNoIndex("A/Dracula:Capitol_1.html")
if a == nil {
t.Errorf("Can't find existing url")
}
}
func TestListArticles(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
var i uint32
for a := range Z.ListArticles() {
i++
t.Log(a.String())
}
if i == 0 {
t.Errorf("Can't find any urls")
}
if i != Z.ArticleCount-1 {
t.Errorf("Can't find the exact ArticleCount urls %d vs %d", i, Z.ArticleCount)
}
}
func TestMainPage(t *testing.T) {
a, _ := Z.MainPage()
if a == nil {
t.Errorf("Can't find the mainpage article")
}
t.Log(a)
}
func TestFavicon(t *testing.T) {
favicon, err := Z.Favicon()
if err != nil {
t.Errorf("%+v", errors.WithStack(err))
}
if favicon == nil {
t.Errorf("Can't find the favicon article")
}
}
func TestMetadata(t *testing.T) {
metadata, err := Z.Metadata()
if err != nil {
t.Errorf("%+v", errors.WithStack(err))
}
if metadata == nil {
t.Errorf("Can't find the metadata")
}
}
func TestData(t *testing.T) {
// addr 0 is a redirect
p, _ := Z.OffsetAtURLIdx(2)
a, _ := Z.ArticleAt(p)
b, _ := a.Data()
data := string(b)
if a.EntryType != RedirectEntry {
if len(data) == 0 {
t.Error("can't read data")
}
}
t.Log(a.String())
t.Log(data)
}
func BenchmarkArticleBytes(b *testing.B) {
// addr 0 is a redirect
p, _ := Z.OffsetAtURLIdx(5)
a, _ := Z.ArticleAt(p)
if a == nil {
b.Errorf("Can't find 1st url")
}
data, err := a.Data()
if err != nil {
b.Error(err)
}
b.SetBytes(int64(len(data)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
a.Data()
bcache.Purge() // prevent memiozing value
}
}