wip: zim lib rewrite
arcad/edge/pipeline/head There was a failure building this commit Details

This commit is contained in:
wpetit 2023-10-11 11:18:32 +02:00
parent 8facff2bd2
commit d948a261d6
35 changed files with 2155 additions and 311 deletions

2
go.mod
View File

@ -1,6 +1,6 @@
module forge.cadoles.com/arcad/edge
go 1.20
go 1.21
require (
github.com/hashicorp/golang-lru/v2 v2.0.7

36
pkg/app/option.go Normal file
View File

@ -0,0 +1,36 @@
package app
import (
"context"
"github.com/pkg/errors"
"gitlab.com/wpetit/goweb/logger"
)
// Options holds the settings assembled by NewOptions.
type Options struct {
// ModuleFactories lists the server module factories; WithModulesFactories
// replaces the whole slice.
ModuleFactories []ServerModuleFactory
// ErrorHandler is called with a context and an error; NewOptions installs a
// default handler that logs the error.
ErrorHandler func(ctx context.Context, err error)
}
// OptionFunc mutates an Options value; NewOptions applies them in call order.
type OptionFunc func(opts *Options)
// NewOptions builds an Options value with its defaults (an empty module
// factory list and an error handler that logs the error together with a stack
// trace) and then applies every given OptionFunc in order.
func NewOptions(funcs ...OptionFunc) *Options {
	defaultErrorHandler := func(ctx context.Context, err error) {
		logger.Error(ctx, err.Error(), logger.E(errors.WithStack(err)))
	}

	options := &Options{
		ModuleFactories: make([]ServerModuleFactory, 0),
		ErrorHandler:    defaultErrorHandler,
	}

	for i := range funcs {
		funcs[i](options)
	}

	return options
}
// WithModulesFactories returns an OptionFunc that sets Options.ModuleFactories
// to the given factories, replacing whatever list was configured before.
func WithModulesFactories(factories ...ServerModuleFactory) OptionFunc {
	apply := func(o *Options) {
		o.ModuleFactories = factories
	}

	return apply
}

View File

@ -150,10 +150,12 @@ func (a *Article) Data() ([]byte, error) {
if err != nil {
return nil, err
}
s, err := a.z.bytesRangeAt(start, start+1)
if err != nil {
return nil, err
}
compression := uint8(s[0])
// blob starts at offset, blob ends at offset

View File

@ -0,0 +1,7 @@
package zim
import "errors"
var (
// ErrNotFound is the sentinel error reported when a lookup has no result;
// callers in this package compare against it with errors.Is.
ErrNotFound = errors.New("not found")
)

View File

@ -0,0 +1,49 @@
package zim
import "github.com/pkg/errors"
// Favicon returns the article to use as the archive icon.
//
// The metadata illustrations are tried first. When none is available, the
// "favicon" and "favicon.png" entries are looked up in the "-" and then the
// "I" namespace. ErrNotFound is returned when no candidate exists; any other
// lookup error is returned as-is (with a stack trace attached).
func (z *ZimReader) Favicon() (*Article, error) {
	fromMetadata, err := z.getMetadataIllustration()

	switch {
	case err != nil && !errors.Is(err, ErrNotFound):
		return nil, errors.WithStack(err)
	case fromMetadata != nil:
		return fromMetadata, nil
	}

	for _, namespace := range []string{"-", "I"} {
		for _, name := range []string{"favicon", "favicon.png"} {
			candidate, lookupErr := z.GetPageNoIndex(namespace + "/" + name)
			if lookupErr != nil && !errors.Is(lookupErr, ErrNotFound) {
				return nil, errors.WithStack(lookupErr)
			}

			if candidate == nil {
				continue
			}

			return candidate, nil
		}
	}

	return nil, errors.WithStack(ErrNotFound)
}
// getMetadataIllustration returns the article of the first illustration
// metadata entry that exists, preferring the 96x96@2 variant over 48x48@1.
// ErrNotFound is returned when neither key is present in the metadata.
func (z *ZimReader) getMetadataIllustration() (*Article, error) {
	metadata, err := z.Metadata(MetadataIllustration96x96at2, MetadataIllustration48x48at1)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// Order matters: the larger illustration wins when both are present.
	preferred := [...]MetadataKey{MetadataIllustration96x96at2, MetadataIllustration48x48at1}
	for _, key := range preferred {
		if _, ok := metadata[key]; !ok {
			continue
		}

		return z.GetPageNoIndex("M/" + string(key))
	}

	return nil, errors.WithStack(ErrNotFound)
}

View File

@ -0,0 +1,69 @@
package zim
import (
"github.com/pkg/errors"
)
// MetadataKey is the name of a metadata entry; Metadata looks each key up
// under the "M/" prefix.
type MetadataKey string
// Metadata keys known to this package.
// See https://wiki.openzim.org/wiki/Metadata
const (
MetadataName MetadataKey = "Name"
MetadataTitle MetadataKey = "Title"
MetadataDescription MetadataKey = "Description"
MetadataLongDescription MetadataKey = "LongDescription"
MetadataCreator MetadataKey = "Creator"
MetadataTags MetadataKey = "Tags"
MetadataDate MetadataKey = "Date"
MetadataPublisher MetadataKey = "Publisher"
MetadataFlavour MetadataKey = "Flavour"
MetadataSource MetadataKey = "Source"
MetadataLanguage MetadataKey = "Language"
// Illustration keys; Favicon prefers the 96x96@2 variant over 48x48@1.
MetadataIllustration48x48at1 MetadataKey = "Illustration_48x48@1"
MetadataIllustration96x96at2 MetadataKey = "Illustration_96x96@2"
)
// knownKeys is the default key set used by Metadata when it is called without
// any explicit key.
var knownKeys = []MetadataKey{
MetadataName,
MetadataTitle,
MetadataDescription,
MetadataLongDescription,
MetadataCreator,
MetadataPublisher,
MetadataLanguage,
MetadataTags,
MetadataDate,
MetadataFlavour,
MetadataSource,
MetadataIllustration48x48at1,
MetadataIllustration96x96at2,
}
// Metadata reads the requested metadata entries (stored under the "M/" prefix)
// and returns them in a freshly built map.
//
// When no key is given, every key of knownKeys is looked up. Keys whose entry
// or entry data is reported as ErrNotFound are left out of the result. Any
// other error aborts the lookup and is returned with a stack trace.
func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error) {
	if len(keys) == 0 {
		keys = knownKeys
	}

	metadata := make(map[MetadataKey]string, len(keys))

	for _, key := range keys {
		article, err := z.GetPageNoIndex("M/" + string(key))
		if err != nil {
			if errors.Is(err, ErrNotFound) {
				continue
			}

			return nil, errors.WithStack(err)
		}

		// Defensive: treat a nil article without error as a missing entry
		// instead of dereferencing it.
		if article == nil {
			continue
		}

		data, err := article.Data()
		if err != nil {
			if errors.Is(err, ErrNotFound) {
				continue
			}

			// Read failures used to be dropped here, which stored an empty
			// value for the key; surface them to the caller instead.
			return nil, errors.WithStack(err)
		}

		metadata[key] = string(data)
	}

	return metadata, nil
}

BIN
pkg/bundle/oldzim/testdata/cadoles.zim vendored Normal file

Binary file not shown.

View File

@ -100,6 +100,7 @@ func (z *ZimReader) ListArticles() <-chan *Article {
if art == nil {
// TODO: deal with redirect continue
continue
}
ch <- art
}

View File

@ -0,0 +1,153 @@
package zim
import (
"path/filepath"
"reflect"
"runtime"
"testing"
"github.com/pkg/errors"
)
// testCases lists the sub-tests that TestZim runs against every ZIM file found
// in the testdata directory.
var testCases = []func(t *testing.T, z *ZimReader){
testOpen,
testData,
testDisplayArticle,
testDisplayInfost,
testFavicon,
testListArticles,
testMainPage,
testMetadata,
testMime,
testURLAtIdx,
}
// TestZim opens every "testdata/*.zim" file and runs all testCases against it.
// Each file gets its own sub-test, named after the file, and each case a nested
// sub-test named after the case function.
func TestZim(t *testing.T) {
	matches, globErr := filepath.Glob("testdata/*.zim")
	if globErr != nil {
		t.Fatalf("%+v", errors.WithStack(globErr))
	}

	for _, path := range matches {
		reader, openErr := NewReader(path)
		if openErr != nil {
			t.Fatalf("%+v", errors.WithStack(openErr))
		}

		t.Run(filepath.Base(path), func(t *testing.T) {
			for _, testCase := range testCases {
				// Derive the sub-test name from the function symbol.
				name := runtime.FuncForPC(reflect.ValueOf(testCase).Pointer()).Name()

				t.Run(name, func(t *testing.T) {
					testCase(t, reader)
				})
			}
		})
	}
}
// testOpen checks that the opened archive reports at least one article.
func testOpen(t *testing.T, zr *ZimReader) {
	if zr.ArticleCount != 0 {
		return
	}

	t.Errorf("No article found")
}

// testMime checks that the archive exposes at least one mime type.
func testMime(t *testing.T, zr *ZimReader) {
	mimeTypes := zr.MimeTypes()
	if len(mimeTypes) != 0 {
		return
	}

	t.Errorf("No mime types found")
}
// testDisplayInfost checks that the reader produces a non-empty textual
// description and logs it.
func testDisplayInfost(t *testing.T, zr *ZimReader) {
	info := zr.String()

	// len() is never negative, so the former "< 0" comparison could not fail;
	// an empty description is the actual failure condition.
	if len(info) == 0 {
		t.Errorf("Can't read infos")
	}

	t.Log(info)
}
// testURLAtIdx resolves the entry at URL index 5 and checks that an article is
// returned. Lookup errors are discarded; a nil article is the failure signal.
func testURLAtIdx(t *testing.T, zr *ZimReader) {
	// addr 0 is a redirect
	offset, _ := zr.OffsetAtURLIdx(5)
	article, _ := zr.ArticleAt(offset)

	if article != nil {
		return
	}

	t.Errorf("Can't find 1st url")
}

// testDisplayArticle resolves the entry at URL index 5, reports a failure when
// no article is returned, and logs the article either way.
func testDisplayArticle(t *testing.T, zr *ZimReader) {
	// addr 0 is a redirect
	offset, _ := zr.OffsetAtURLIdx(5)
	article, _ := zr.ArticleAt(offset)

	if article == nil {
		t.Errorf("Can't find 1st url")
	}

	t.Log(article)
}
// testListArticles drains ListArticles, logging every article, and checks that
// the number of received articles is non-zero and equals ArticleCount-1.
// The test is skipped in short mode.
func testListArticles(t *testing.T, zr *ZimReader) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var received uint32

	articles := zr.ListArticles()
	for article := range articles {
		received++
		t.Log(article.String())
	}

	if received == 0 {
		t.Errorf("Can't find any urls")
	}

	if expected := zr.ArticleCount - 1; received != expected {
		t.Errorf("Can't find the exact ArticleCount urls %d vs %d", received, zr.ArticleCount)
	}
}
// testMainPage checks that the archive returns a main page article and logs it.
// The error returned by MainPage is discarded; a nil article is the failure
// signal.
func testMainPage(t *testing.T, zr *ZimReader) {
	mainPage, _ := zr.MainPage()
	if mainPage == nil {
		t.Errorf("Can't find the mainpage article")
	}

	t.Log(mainPage)
}

// testFavicon checks that Favicon succeeds and returns an article.
func testFavicon(t *testing.T, zr *ZimReader) {
	icon, faviconErr := zr.Favicon()

	if faviconErr != nil {
		t.Errorf("%+v", errors.WithStack(faviconErr))
	}

	if icon == nil {
		t.Errorf("Can't find the favicon article")
	}
}

// testMetadata checks that Metadata succeeds and returns a non-nil map.
func testMetadata(t *testing.T, zr *ZimReader) {
	values, metadataErr := zr.Metadata()

	if metadataErr != nil {
		t.Errorf("%+v", errors.WithStack(metadataErr))
	}

	if values == nil {
		t.Errorf("Can't find the metadata")
	}
}
// testData resolves the entry at URL index 5, reads its data and checks that a
// non-redirect entry yields a non-empty payload. The article and its data are
// logged.
func testData(t *testing.T, zr *ZimReader) {
	// addr 0 is a redirect
	p, _ := zr.OffsetAtURLIdx(5)
	a, _ := zr.ArticleAt(p)

	// Stop here instead of dereferencing a nil article below.
	if a == nil {
		t.Fatal("Can't find 1st url")
	}

	b, err := a.Data()
	if err != nil && a.EntryType != RedirectEntry {
		// A read failure on a content entry is the real cause of the empty
		// payload; report it instead of silently dropping it.
		t.Fatalf("%+v", errors.WithStack(err))
	}

	data := string(b)

	if a.EntryType != RedirectEntry {
		if len(data) == 0 {
			t.Error("can't read data")
		}
	}

	t.Log(a.String())
	t.Log(data)
}

View File

@ -0,0 +1,8 @@
package zim
import "io"
// BlobReader is an io.ReadCloser that can additionally report a size.
type BlobReader interface {
io.ReadCloser
// Size returns a size as int64, or an error when it cannot be determined.
Size() (int64, error)
}

View File

@ -0,0 +1,163 @@
package zim
import (
"bytes"
"encoding/binary"
"io"
"os"
"sync"
"github.com/pkg/errors"
)
// CompressedBlobReader exposes one blob of a compressed cluster. The cluster
// is read and decoded on first use (see loadClusterData) and the extracted
// blob bytes are kept in data.
type CompressedBlobReader struct {
// reader provides readRange over the underlying file.
reader *Reader
// decoderFactory wraps the compressed bytes into a decoding reader.
decoderFactory BlobDecoderFactory
// clusterStartOffset and clusterEndOffset delimit the cluster; the compressed
// payload is read starting at clusterStartOffset+1.
clusterStartOffset uint64
clusterEndOffset uint64
// blobIndex selects the blob's entry in the offset table at the start of the
// decoded cluster.
blobIndex uint32
// blobSize is the width in bytes of one offset table entry: 8 selects 64-bit
// offsets, any other value is decoded as 32-bit.
blobSize int
// readOffset is the position in data from which the next Read copies.
readOffset uint64
// loadCluster guards the one-time decoding; loadClusterErr keeps its outcome.
loadCluster sync.Once
loadClusterErr error
// data holds the extracted blob bytes once loaded.
data []byte
// closed is set by Close; loadClusterData then fails with os.ErrClosed.
closed bool
}
// Size implements BlobReader. It loads the cluster if needed and returns the
// number of bytes of the extracted blob.
func (r *CompressedBlobReader) Size() (int64, error) {
	err := r.loadClusterData()
	if err != nil {
		return 0, errors.WithStack(err)
	}

	size := int64(len(r.data))

	return size, nil
}
// Close implements io.ReadCloser. It marks the reader as closed and zeroes the
// bytes of the loaded blob; it always returns nil.
func (r *CompressedBlobReader) Close() error {
	r.closed = true

	// clear on a slice zeroes its elements; the length is unchanged.
	clear(r.data)

	return nil
}
// Read implements io.ReadCloser. It copies the next bytes of the extracted
// blob into p and advances the read position.
//
// io.EOF is returned together with the final chunk, and on every call after
// the blob has been fully consumed (with n == 0).
func (r *CompressedBlobReader) Read(p []byte) (int, error) {
	if err := r.loadClusterData(); err != nil {
		return 0, errors.WithStack(err)
	}

	total := uint64(len(r.data))
	if r.readOffset >= total {
		return 0, io.EOF
	}

	// copy is bounded by the shorter of the two slices, so no intermediate
	// buffer is needed.
	n := copy(p, r.data[r.readOffset:])

	// Always advance, including on the final chunk: otherwise a Read issued
	// after EOF would deliver the last chunk a second time.
	r.readOffset += uint64(n)

	if r.readOffset >= total {
		return n, io.EOF
	}

	return n, nil
}
// loadClusterData makes sure the blob bytes are available in r.data.
//
// The cluster is read and decoded at most once; the outcome of that single
// attempt (success or error) is returned by every later call. Once the reader
// is closed, os.ErrClosed is returned.
func (r *CompressedBlobReader) loadClusterData() error {
	if r.closed {
		return errors.WithStack(os.ErrClosed)
	}

	r.loadCluster.Do(func() {
		r.data, r.loadClusterErr = r.decodeBlob()
	})

	if r.loadClusterErr != nil {
		return errors.WithStack(r.loadClusterErr)
	}

	return nil
}

// decodeBlob reads the compressed cluster, decodes it entirely and returns a
// copy of the bytes belonging to the blob selected by r.blobIndex.
func (r *CompressedBlobReader) decodeBlob() ([]byte, error) {
	// Guard the unsigned subtraction below: a reversed range would wrap around
	// and make the allocation panic.
	if r.clusterEndOffset < r.clusterStartOffset {
		return nil, errors.Errorf("invalid cluster range '%d-%d'", r.clusterStartOffset, r.clusterEndOffset)
	}

	compressedData := make([]byte, r.clusterEndOffset-r.clusterStartOffset)
	// The payload starts one byte after the cluster start.
	if err := r.reader.readRange(int64(r.clusterStartOffset+1), compressedData); err != nil {
		return nil, errors.WithStack(err)
	}

	decoder, err := r.decoderFactory(bytes.NewBuffer(compressedData))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	defer decoder.Close()

	uncompressedData, err := io.ReadAll(decoder)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// Entry blobIndex of the offset table is the blob start, the following
	// entry its end.
	blobStart, err := r.blobPointerAt(uncompressedData, uint64(r.blobIndex))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	blobEnd, err := r.blobPointerAt(uncompressedData, uint64(r.blobIndex)+1)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// Reject inconsistent offsets instead of panicking on the slice expression.
	if blobStart > blobEnd || blobEnd > uint64(len(uncompressedData)) {
		return nil, errors.Errorf("invalid blob range '%d-%d' for cluster of %d bytes", blobStart, blobEnd, len(uncompressedData))
	}

	data := make([]byte, blobEnd-blobStart)
	copy(data, uncompressedData[blobStart:blobEnd])

	return data, nil
}

// blobPointerAt decodes entry number index of the offset table located at the
// beginning of cluster. Entries are little-endian and r.blobSize bytes wide
// (64-bit when r.blobSize is 8, 32-bit otherwise).
func (r *CompressedBlobReader) blobPointerAt(cluster []byte, index uint64) (uint64, error) {
	// Computed in 64 bits so a large index cannot wrap around.
	width := uint64(r.blobSize)
	start := index * width
	end := start + width

	if end < start || end > uint64(len(cluster)) {
		return 0, errors.Errorf("blob pointer '%d' out of bounds for cluster of %d bytes", index, len(cluster))
	}

	raw := cluster[start:end]

	if r.blobSize == 8 {
		pointer, err := readUint64(raw, binary.LittleEndian)
		if err != nil {
			return 0, errors.WithStack(err)
		}

		return pointer, nil
	}

	pointer, err := readUint32(raw, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return uint64(pointer), nil
}
// BlobDecoderFactory builds a decoding io.ReadCloser on top of a reader of
// compressed bytes; CompressedBlobReader uses it to decode a whole cluster.
type BlobDecoderFactory func(io.Reader) (io.ReadCloser, error)
// NewCompressedBlobReader returns a CompressedBlobReader for the blob at
// blobIndex of the cluster delimited by clusterStartOffset and
// clusterEndOffset. Nothing is read until Size or Read is first called.
func NewCompressedBlobReader(reader *Reader, decoderFactory BlobDecoderFactory, clusterStartOffset, clusterEndOffset uint64, blobIndex uint32, blobSize int) *CompressedBlobReader {
	blobReader := new(CompressedBlobReader)

	blobReader.reader = reader
	blobReader.decoderFactory = decoderFactory
	blobReader.clusterStartOffset = clusterStartOffset
	blobReader.clusterEndOffset = clusterEndOffset
	blobReader.blobIndex = blobIndex
	blobReader.blobSize = blobSize
	blobReader.readOffset = 0

	return blobReader
}
// Compile-time check that *CompressedBlobReader implements BlobReader.
var _ BlobReader = (*CompressedBlobReader)(nil)

View File

@ -0,0 +1,221 @@
package zim
import (
"encoding/binary"
"github.com/pkg/errors"
)
// zimCompression identifies the compression scheme stored in the low four bits
// of a cluster header byte (see ContentEntry.Reader).
type zimCompression int
// Values 0 and 1 are read as uncompressed clusters, 4 and 5 are decoded through
// the XZ and ZStandard blob readers, 2 and 3 are rejected as unsupported.
// NOTE(review): every identifier carries a "None" infix, including the ones
// naming actual compression algorithms; renaming needs a package-wide change.
const (
zimCompressionNoneZeno zimCompression = 0
zimCompressionNone zimCompression = 1
zimCompressionNoneZLib zimCompression = 2
zimCompressionNoneBZip2 zimCompression = 3
zimCompressionNoneXZ zimCompression = 4
zimCompressionNoneZStandard zimCompression = 5
)
// ContentEntry is an entry whose content is a blob stored in a cluster.
type ContentEntry struct {
*BaseEntry
// mimeType is resolved from the reader's mime type list when parsing.
mimeType string
// clusterIndex is the index of the cluster holding the blob.
clusterIndex uint32
// blobIndex is the index of the blob inside that cluster.
blobIndex uint32
}
// Compression returns the compression identifier of the entry's cluster, i.e.
// the low four bits of the cluster header byte.
func (e *ContentEntry) Compression() (int, error) {
	header, _, _, err := e.readClusterInfo()
	if err != nil {
		return 0, errors.WithStack(err)
	}

	// Keep only the low nibble of the 8-bit header.
	lowNibble := header & 0x0f

	return int(lowNibble), nil
}
// MimeType returns the mime type resolved for this entry at parse time.
func (e *ContentEntry) MimeType() string {
	mimeType := e.mimeType

	return mimeType
}
// Reader returns a BlobReader over the content of the entry.
//
// The cluster header byte selects how the blob is read: its low four bits hold
// the compression identifier and bit 4 flags 64-bit ("extended") blob offsets.
// Uncompressed clusters are served by an UncompressedBlobReader, XZ and
// ZStandard clusters by their decoding readers. Any other identifier yields
// ErrCompressionAlgorithmNotSupported.
func (e *ContentEntry) Reader() (BlobReader, error) {
	clusterHeader, clusterStartOffset, clusterEndOffset, err := e.readClusterInfo()
	if err != nil {
		return nil, errors.WithStack(err)
	}

	compression := clusterHeader & 0x0f
	extended := clusterHeader&0x10 != 0

	blobSize := 4
	if extended {
		blobSize = 8
	}

	switch compression {
	// Uncompressed blobs
	case uint8(zimCompressionNoneZeno), uint8(zimCompressionNone):
		// The offset table starts right after the header byte.
		startPos := clusterStartOffset + 1
		// Widen before multiplying so a large blob index cannot overflow
		// 32 bits.
		blobOffset := uint64(e.blobIndex) * uint64(blobSize)

		blobStart, err := e.readBlobPointer(startPos+blobOffset, extended)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		blobEnd, err := e.readBlobPointer(startPos+blobOffset+uint64(blobSize), extended)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return NewUncompressedBlobReader(e.reader, startPos+blobStart, startPos+blobEnd, blobSize), nil

	// Supported compression algorithms
	case uint8(zimCompressionNoneXZ):
		return NewXZBlobReader(e.reader, clusterStartOffset, clusterEndOffset, e.blobIndex, blobSize), nil

	case uint8(zimCompressionNoneZStandard):
		return NewZStdBlobReader(e.reader, clusterStartOffset, clusterEndOffset, e.blobIndex, blobSize), nil

	// Unsupported compression algorithms
	case uint8(zimCompressionNoneZLib), uint8(zimCompressionNoneBZip2):
		fallthrough
	default:
		return nil, errors.Wrapf(ErrCompressionAlgorithmNotSupported, "unexpected compression algorithm '%d'", compression)
	}
}

// readBlobPointer reads one little-endian blob offset stored at the given file
// offset: 8 bytes wide when extended is true, 4 bytes wide otherwise.
func (e *ContentEntry) readBlobPointer(offset uint64, extended bool) (uint64, error) {
	if extended {
		data := make([]byte, 8)
		if err := e.reader.readRange(int64(offset), data); err != nil {
			return 0, errors.WithStack(err)
		}

		pointer, err := readUint64(data, binary.LittleEndian)
		if err != nil {
			return 0, errors.WithStack(err)
		}

		return pointer, nil
	}

	data := make([]byte, 4)
	if err := e.reader.readRange(int64(offset), data); err != nil {
		return 0, errors.WithStack(err)
	}

	pointer, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return uint64(pointer), nil
}
// Redirect implements Entry. A content entry is its own redirect target, so
// the receiver is returned unchanged with a nil error.
func (e *ContentEntry) Redirect() (*ContentEntry, error) {
	self := e

	return self, nil
}
// readClusterInfo returns the header byte of the entry's cluster followed by
// the cluster start and end offsets reported by getClusterOffsets.
func (e *ContentEntry) readClusterInfo() (uint8, uint64, uint64, error) {
	start, end, err := e.reader.getClusterOffsets(int(e.clusterIndex))
	if err != nil {
		return 0, 0, 0, errors.WithStack(err)
	}

	// The header is the very first byte of the cluster.
	var header [1]byte
	if err := e.reader.readRange(int64(start), header[:]); err != nil {
		return 0, 0, 0, errors.WithStack(err)
	}

	return header[0], start, end, nil
}
// parseContentEntry decodes the content entry stored at offset on top of the
// already parsed base entry.
//
// Fields read relative to offset: mime type index (2 bytes at +0), namespace
// (1 byte at +3), cluster index (4 bytes at +8), blob index (4 bytes at +12),
// then the URL string at +16 followed by the title string. An error is
// returned when the mime type index does not fit the reader's mime type list.
func (r *Reader) parseContentEntry(offset int64, base *BaseEntry) (*ContentEntry, error) {
	entry := &ContentEntry{
		BaseEntry: base,
	}

	data := make([]byte, 2)
	if err := r.readRange(offset, data); err != nil {
		return nil, errors.WithStack(err)
	}

	mimeTypeIndex, err := readUint16(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// Compare as int: converting the list length to uint16 would truncate it
	// (65536 mime types would become 0 and reject every index).
	if int(mimeTypeIndex) >= len(r.mimeTypes) {
		return nil, errors.Errorf("mime type index '%d' greater than mime types length '%d'", mimeTypeIndex, len(r.mimeTypes))
	}

	entry.mimeType = r.mimeTypes[mimeTypeIndex]

	data = make([]byte, 1)
	if err := r.readRange(offset+3, data); err != nil {
		return nil, errors.WithStack(err)
	}

	entry.namespace = Namespace(data[0])

	data = make([]byte, 4)
	if err := r.readRange(offset+8, data); err != nil {
		return nil, errors.WithStack(err)
	}

	clusterIndex, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.clusterIndex = clusterIndex

	if err := r.readRange(offset+12, data); err != nil {
		return nil, errors.WithStack(err)
	}

	blobIndex, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.blobIndex = blobIndex

	url, read, err := r.readStringAt(offset + 16)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.url = url

	// The title follows the URL; the extra byte presumably skips the URL's
	// terminator (TODO confirm against readStringAt).
	title, _, err := r.readStringAt(offset + 16 + read + 1)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	entry.title = title

	return entry, nil
}

138
pkg/bundle/zim/entry.go Normal file
View File

@ -0,0 +1,138 @@
package zim
import (
"encoding/binary"
"fmt"
"github.com/pkg/errors"
)
// Entry is the common view over the entries of a ZIM file (content entries and
// redirect entries).
type Entry interface {
// Redirect resolves the entry to the content entry it designates.
Redirect() (*ContentEntry, error)
// Namespace returns the namespace of the entry.
Namespace() Namespace
// URL returns the URL of the entry, without its namespace.
URL() string
// FullURL returns the namespace and the URL joined by a slash.
FullURL() string
// Title returns the title of the entry.
Title() string
}
// BaseEntry carries the fields shared by content and redirect entries.
type BaseEntry struct {
// mimeTypeIndex is the raw 16-bit value read at the start of the entry; the
// value zimRedirect marks a redirect entry (see Reader.parseEntryAt).
mimeTypeIndex uint16
namespace Namespace
url string
// title may be empty, in which case Title falls back to url.
title string
// reader is the Reader the entry was parsed from.
reader *Reader
}
// Namespace returns the namespace of the entry.
func (e *BaseEntry) Namespace() Namespace {
	ns := e.namespace

	return ns
}

// Title returns the title of the entry, or its URL when the title is empty.
func (e *BaseEntry) Title() string {
	if e.title != "" {
		return e.title
	}

	return e.url
}

// URL returns the URL of the entry, without its namespace.
func (e *BaseEntry) URL() string {
	url := e.url

	return url
}

// FullURL returns the namespace and the URL of the entry joined by a slash.
func (e *BaseEntry) FullURL() string {
	ns, url := e.Namespace(), e.URL()

	return toFullURL(ns, url)
}
// parseBaseEntry reads the fields common to every entry kind at offset: the
// little-endian 16-bit mime type index (2 bytes at +0) and the namespace
// (1 byte at +3).
func (r *Reader) parseBaseEntry(offset int64) (*BaseEntry, error) {
	mimeBytes := make([]byte, 2)
	if err := r.readRange(offset, mimeBytes); err != nil {
		return nil, errors.WithStack(err)
	}

	mimeTypeIndex, err := readUint16(mimeBytes, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	nsBytes := make([]byte, 1)
	if err := r.readRange(offset+3, nsBytes); err != nil {
		return nil, errors.WithStack(err)
	}

	base := &BaseEntry{
		reader:        r,
		mimeTypeIndex: mimeTypeIndex,
		namespace:     Namespace(nsBytes[0]),
	}

	return base, nil
}
// RedirectEntry is an entry that points at another entry of the URL index.
type RedirectEntry struct {
*BaseEntry
// redirectIndex is the position of the target entry in the reader's urlIndex.
redirectIndex uint32
}
// Redirect implements Entry. It parses the entry designated by redirectIndex
// and resolves it in turn, so chains of redirects end on a content entry.
//
// ErrInvalidEntryIndex is returned when redirectIndex is outside of the URL
// index.
// NOTE(review): the resolution recurses through the target's Redirect with no
// visible cycle guard; a redirect loop in the file would presumably never
// terminate.
func (e *RedirectEntry) Redirect() (*ContentEntry, error) {
	if e.redirectIndex >= uint32(len(e.reader.urlIndex)) {
		return nil, errors.Wrapf(ErrInvalidEntryIndex, "entry index '%d' out of bounds", e.redirectIndex)
	}

	target, err := e.reader.parseEntryAt(int64(e.reader.urlIndex[e.redirectIndex]))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	// Entry.Redirect already returns a *ContentEntry: use it directly rather
	// than storing it back into an Entry and type-asserting it, a check that
	// could never fail.
	content, err := target.Redirect()
	if err != nil {
		return nil, errors.WithStack(err)
	}

	return content, nil
}
// parseRedirectEntry decodes the redirect entry stored at offset on top of the
// already parsed base entry: the redirect index (4 bytes, little-endian, at
// +8), then the URL string at +12 followed by the title string.
func (r *Reader) parseRedirectEntry(offset int64, base *BaseEntry) (*RedirectEntry, error) {
	redirect := &RedirectEntry{BaseEntry: base}

	indexBytes := make([]byte, 4)
	if err := r.readRange(offset+8, indexBytes); err != nil {
		return nil, errors.WithStack(err)
	}

	target, err := readUint32(indexBytes, binary.LittleEndian)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	redirect.redirectIndex = target

	urlOffset := offset + 12

	url, urlLength, err := r.readStringAt(urlOffset)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	redirect.url = url

	// NOTE(review): parseContentEntry and parseMimeTypes add one extra byte
	// after the length returned by readStringAt, while this offset does not;
	// confirm which one matches readStringAt's contract.
	title, _, err := r.readStringAt(urlOffset + urlLength)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	redirect.title = title

	return redirect, nil
}
// toFullURL joins a namespace and a URL with a slash, e.g. "M/Title".
func toFullURL(ns Namespace, url string) string {
	fullURL := fmt.Sprintf("%s/%s", ns, url)

	return fullURL
}

View File

@ -0,0 +1,46 @@
package zim
import "github.com/pkg/errors"
// EntryIterator walks the entries of a Reader in URL index order. Call Next,
// then read the current entry with Entry; check Err once Next returns false.
type EntryIterator struct {
// index is the position of the entry the next call to Next will load.
index int
// entry is the entry loaded by the last successful call to Next.
entry Entry
// err is the first error met while loading an entry; it stops the iteration.
err error
reader *Reader
}
// Next loads the next entry and reports whether one is available.
//
// It returns false once every entry has been visited or after an error; in the
// latter case Err returns the cause.
func (it *EntryIterator) Next() bool {
	if it.err != nil {
		return false
	}

	// Valid positions are 0..EntryCount()-1. Comparing against the count
	// itself (rather than count-1) visits the last entry too and avoids the
	// unsigned underflow that occurred when the count was 0.
	if it.index >= int(it.reader.EntryCount()) {
		return false
	}

	entry, err := it.reader.EntryAt(it.index)
	if err != nil {
		it.err = errors.WithStack(err)

		return false
	}

	it.entry = entry
	it.index++

	return true
}
// Err returns the error that stopped the iteration, or nil.
func (it *EntryIterator) Err() error {
	err := it.err

	return err
}

// Index returns the iterator position. Next increments it after loading an
// entry, so following a successful Next it is one past the index of the entry
// returned by Entry.
func (it *EntryIterator) Index() int {
	position := it.index

	return position
}

// Entry returns the entry loaded by the last successful call to Next.
func (it *EntryIterator) Entry() Entry {
	current := it.entry

	return current
}

View File

@ -2,4 +2,9 @@ package zim
import "errors"
var ErrNotFound = errors.New("not found")
var (
ErrInvalidEntryIndex = errors.New("invalid entry index")
ErrNotFound = errors.New("not found")
ErrInvalidRedirect = errors.New("invalid redirect")
ErrCompressionAlgorithmNotSupported = errors.New("compression algorithm not supported")
)

View File

@ -2,8 +2,8 @@ package zim
import "github.com/pkg/errors"
func (z *ZimReader) Favicon() (*Article, error) {
illustration, err := z.getMetadataIllustration()
func (r *Reader) Favicon() (*ContentEntry, error) {
illustration, err := r.getMetadataIllustration()
if err != nil && !errors.Is(err, ErrNotFound) {
return nil, errors.WithStack(err)
}
@ -12,37 +12,54 @@ func (z *ZimReader) Favicon() (*Article, error) {
return illustration, nil
}
namespaces := []string{"-", "I"}
entryNames := []string{"favicon", "favicon.png"}
namespaces := []Namespace{V5NamespaceLayout, V5NamespaceImageFile}
urls := []string{"favicon", "favicon.png"}
for _, ns := range namespaces {
for _, en := range entryNames {
article, err := z.GetPageNoIndex(ns + "/" + en)
for _, url := range urls {
entry, err := r.EntryWithURL(ns, url)
if err != nil && !errors.Is(err, ErrNotFound) {
return nil, errors.WithStack(err)
}
if article != nil {
return article, nil
if errors.Is(err, ErrNotFound) {
continue
}
content, err := entry.Redirect()
if err != nil {
return nil, errors.WithStack(err)
}
return content, nil
}
}
return nil, errors.WithStack(ErrNotFound)
}
func (z *ZimReader) getMetadataIllustration() (*Article, error) {
metadata, err := z.Metadata(MetadataIllustration96x96at2, MetadataIllustration48x48at1)
func (r *Reader) getMetadataIllustration() (*ContentEntry, error) {
keys := []MetadataKey{MetadataIllustration96x96at2, MetadataIllustration48x48at1}
metadata, err := r.Metadata(keys...)
if err != nil {
return nil, errors.WithStack(err)
}
if _, exists := metadata[MetadataIllustration96x96at2]; exists {
return z.GetPageNoIndex("M/" + string(MetadataIllustration96x96at2))
}
for _, k := range keys {
if _, exists := metadata[k]; exists {
entry, err := r.EntryWithURL(V5NamespaceMetadata, string(k))
if err != nil {
return nil, errors.WithStack(err)
}
if _, exists := metadata[MetadataIllustration48x48at1]; exists {
return z.GetPageNoIndex("M/" + string(MetadataIllustration48x48at1))
content, err := entry.Redirect()
if err != nil {
return nil, errors.WithStack(err)
}
return content, nil
}
}
return nil, errors.WithStack(ErrNotFound)

View File

@ -1,6 +1,8 @@
package zim
import (
"io"
"github.com/pkg/errors"
)
@ -40,7 +42,7 @@ var knownKeys = []MetadataKey{
}
// Metadata returns a copy of the internal metadata map of the ZIM file.
func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error) {
func (r *Reader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error) {
if len(keys) == 0 {
keys = knownKeys
}
@ -48,7 +50,7 @@ func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error
metadata := make(map[MetadataKey]string)
for _, key := range keys {
article, err := z.GetPageNoIndex("M/" + string(key))
entry, err := r.EntryWithURL(V5NamespaceMetadata, string(key))
if err != nil {
if errors.Is(err, ErrNotFound) {
continue
@ -57,9 +59,19 @@ func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error
return nil, errors.WithStack(err)
}
data, err := article.Data()
if errors.Is(err, ErrNotFound) {
continue
content, err := entry.Redirect()
if err != nil {
return nil, errors.WithStack(err)
}
reader, err := content.Reader()
if err != nil {
return nil, errors.WithStack(err)
}
data, err := io.ReadAll(reader)
if err != nil {
return nil, errors.WithStack(err)
}
metadata[key] = string(data)

View File

@ -0,0 +1,23 @@
package zim
// Namespace is the namespace part of an entry address (the part before the
// slash in a full URL).
type Namespace string
// Namespaces prefixed V6.
// NOTE(review): the V5/V6 prefixes presumably refer to the major version of the
// ZIM format using each layout; confirm against the format specification.
const (
V6NamespaceContent Namespace = "C"
V6NamespaceMetadata Namespace = "M"
V6NamespaceWellKnown Namespace = "W"
V6NamespaceSearch Namespace = "X"
)
// Namespaces prefixed V5. Several values are shared with the V6 set ("M", "W"
// and "X").
const (
V5NamespaceLayout Namespace = "-"
V5NamespaceArticle Namespace = "A"
V5NamespaceArticleMetadata Namespace = "B"
V5NamespaceImageFile Namespace = "I"
V5NamespaceImageText Namespace = "J"
V5NamespaceMetadata Namespace = "M"
V5NamespaceCategoryText Namespace = "U"
V5NamespaceCategoryArticleList Namespace = "V"
V5NamespaceCategoryPerArticle Namespace = "W"
V5NamespaceSearch Namespace = "X"
)

30
pkg/bundle/zim/option.go Normal file
View File

@ -0,0 +1,30 @@
package zim
import "time"
// Options groups the tunables of the ZIM reader.
type Options struct {
	URLCacheSize int
	URLCacheTTL  time.Duration
	// CacheSize defaults to 2048 (see NewOptions).
	CacheSize int
}

// OptionFunc mutates an Options value; NewOptions applies them in call order.
type OptionFunc func(opts *Options)

// NewOptions returns the default options (CacheSize set to 2048, every other
// field left at its zero value) modified by the given functions, applied in
// order.
func NewOptions(funcs ...OptionFunc) *Options {
	// Start from the defaults, then let the caller's options override them.
	options := &Options{CacheSize: 2048}

	for _, apply := range funcs {
		apply(options)
	}

	return options
}

// WithCacheSize returns an OptionFunc that sets Options.CacheSize to size.
func WithCacheSize(size int) OptionFunc {
	setCacheSize := func(o *Options) {
		o.CacheSize = size
	}

	return setCacheSize
}

568
pkg/bundle/zim/reader.go Normal file
View File

@ -0,0 +1,568 @@
package zim
import (
"context"
"encoding/binary"
"fmt"
"io"
"os"
"strings"
"sync"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/pkg/errors"
"gitlab.com/wpetit/goweb/logger"
)
// zimFormatMagicNumber is the value parseHeader expects in the first four
// bytes of the file (decoded as a little-endian uint32).
const zimFormatMagicNumber uint32 = 0x44D495A
// nullByte is the NUL character (presumably the string terminator used by
// readStringAt; confirm there).
const nullByte = '\x00'
// zimRedirect is the mime type index value that marks an entry as a redirect
// (see parseEntryAt).
const zimRedirect = 0xffff
// Reader gives access to the entries of a ZIM file. The header fields below
// are filled by parseHeader from fixed offsets of the file.
type Reader struct {
// majorVersion and minorVersion are read at header offsets 4 and 6.
majorVersion uint16
minorVersion uint16
// uuid is the textual form of the 16 bytes stored at header offset 8.
uuid string
// entryCount and clusterCount are read at header offsets 24 and 28.
entryCount uint32
clusterCount uint32
// Positions read at header offsets 32, 40, 48 and 56.
urlPtrPos uint64
titlePtrPos uint64
clusterPtrPos uint64
mimeListPos uint64
// mainPage is read at header offset 64; 0xffffffff means there is no main page
// (see MainPage). layoutPage is read at header offset 68.
mainPage uint32
layoutPage uint32
// checksumPos is read at header offset 72.
checksumPos uint64
// mimeTypes is the list parsed by parseMimeTypes, indexed by mime type index.
mimeTypes []string
// urlIndex holds one entry pointer per entry, as parsed by parseURLIndex.
urlIndex []uint64
// cache maps URL and title keys to already parsed entries.
cache *lru.Cache[string, Entry]
// seeker is the underlying file; seekerLock serializes seek+read pairs.
seeker io.ReadSeekCloser
seekerLock sync.Mutex
}
// Version returns the major and minor version numbers read from the header.
func (r *Reader) Version() (majorVersion, minorVersion uint16) {
	majorVersion = r.majorVersion
	minorVersion = r.minorVersion

	return majorVersion, minorVersion
}

// EntryCount returns the number of entries declared in the header.
func (r *Reader) EntryCount() uint32 {
	count := r.entryCount

	return count
}

// ClusterCount returns the number of clusters declared in the header.
func (r *Reader) ClusterCount() uint32 {
	count := r.clusterCount

	return count
}

// UUID returns the textual form of the identifier read from the header.
func (r *Reader) UUID() string {
	uuid := r.uuid

	return uuid
}
// Close closes the underlying file and returns the error reported by it, if
// any.
func (r *Reader) Close() error {
	closeErr := r.seeker.Close()
	if closeErr != nil {
		return errors.WithStack(closeErr)
	}

	return nil
}
// MainPage returns the entry designated as main page by the header.
//
// ErrNotFound is returned when the header declares no main page (0xffffffff)
// or when the declared index is outside of the URL index. Any other failure,
// such as a read error while parsing the entry, is returned as-is so that it
// is not mistaken for a missing page.
func (r *Reader) MainPage() (Entry, error) {
	if r.mainPage == 0xffffffff {
		return nil, errors.WithStack(ErrNotFound)
	}

	entry, err := r.EntryAt(int(r.mainPage))
	if err != nil {
		if errors.Is(err, ErrInvalidEntryIndex) {
			return nil, errors.WithStack(ErrNotFound)
		}

		return nil, errors.WithStack(err)
	}

	return entry, nil
}
// Entries returns a new iterator positioned before the first entry of the
// reader.
func (r *Reader) Entries() *EntryIterator {
	iterator := new(EntryIterator)
	iterator.reader = r

	return iterator
}
// EntryAt parses and returns the entry at position idx of the URL index, and
// stores it in the cache. ErrInvalidEntryIndex is returned when idx is out of
// bounds.
func (r *Reader) EntryAt(idx int) (Entry, error) {
	if idx < 0 || idx >= len(r.urlIndex) {
		return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of bounds", idx)
	}

	pointer := r.urlIndex[idx]

	parsed, err := r.parseEntryAt(int64(pointer))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	r.cacheEntry(pointer, parsed)

	return parsed, nil
}
// EntryWithURL returns the entry of namespace ns whose URL, or full URL,
// equals url.
//
// The cache is consulted first; on a miss every entry is scanned in URL index
// order. ErrNotFound is returned when no entry matches.
func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) {
	if cached, ok := r.getEntryByURLFromCache(toFullURL(ns, url)); ok {
		logger.Debug(context.Background(), "found entry with url from cache", logger.F("fullURL", cached.FullURL()))

		return cached, nil
	}

	it := r.Entries()

	for it.Next() {
		candidate := it.Entry()

		if candidate.Namespace() != ns {
			continue
		}

		if candidate.URL() == url || candidate.FullURL() == url {
			return candidate, nil
		}
	}

	if scanErr := it.Err(); scanErr != nil {
		return nil, errors.WithStack(scanErr)
	}

	return nil, errors.WithStack(ErrNotFound)
}
// EntryWithTitle returns the entry of namespace ns whose title equals title.
//
// The cache is consulted first; on a miss every entry is scanned in URL index
// order. ErrNotFound is returned when no entry matches.
func (r *Reader) EntryWithTitle(ns Namespace, title string) (Entry, error) {
	if cached, ok := r.getEntryByTitleFromCache(ns, title); ok {
		logger.Debug(context.Background(), "found entry with title from cache", logger.F("entry", cached.FullURL()))

		return cached, nil
	}

	it := r.Entries()

	for it.Next() {
		candidate := it.Entry()

		if candidate.Title() != title {
			continue
		}

		if candidate.Namespace() == ns {
			return candidate, nil
		}
	}

	if scanErr := it.Err(); scanErr != nil {
		return nil, errors.WithStack(scanErr)
	}

	return nil, errors.WithStack(ErrNotFound)
}
// getURLCacheKey returns the cache key under which an entry is stored for its
// full URL.
func (r *Reader) getURLCacheKey(fullURL string) string {
	const prefix = "url:"

	return prefix + fullURL
}

// getTitleCacheKey returns the cache key under which an entry is stored for
// its namespace and title.
func (r *Reader) getTitleCacheKey(ns Namespace, title string) string {
	key := fmt.Sprintf("title:%s/%s", ns, title)

	return key
}
// cacheEntry stores entry in the cache under both its URL key and its title
// key, unless both keys are already present. The offset parameter is currently
// unused.
func (r *Reader) cacheEntry(offset uint64, entry Entry) {
	urlKey := r.getURLCacheKey(entry.FullURL())
	titleKey := r.getTitleCacheKey(entry.Namespace(), entry.Title())

	_, hasURL := r.cache.Peek(urlKey)
	_, hasTitle := r.cache.Peek(titleKey)

	if !hasURL || !hasTitle {
		r.cache.Add(urlKey, entry)
		r.cache.Add(titleKey, entry)
	}
}
// getEntryByURLFromCache looks up the cache for an entry stored under the
// given full URL and reports whether it was found.
func (r *Reader) getEntryByURLFromCache(fullURL string) (Entry, bool) {
	cached, found := r.cache.Get(r.getURLCacheKey(fullURL))

	return cached, found
}

// getEntryByTitleFromCache looks up the cache for an entry stored under the
// given namespace and title and reports whether it was found.
func (r *Reader) getEntryByTitleFromCache(namespace Namespace, title string) (Entry, bool) {
	cached, found := r.cache.Get(r.getTitleCacheKey(namespace, title))

	return cached, found
}
// parse loads the structural data of the file: the header, then the mime type
// list, then the URL index. It stops at the first failing step.
func (r *Reader) parse() error {
	steps := [...]func() error{
		r.parseHeader,
		r.parseMimeTypes,
		r.parseURLIndex,
	}

	for _, step := range steps {
		if err := step(); err != nil {
			return errors.WithStack(err)
		}
	}

	return nil
}
// parseHeader validates the magic number and fills the header fields of the
// reader from their fixed offsets, in file order. Parsing stops at the first
// read error; fields read before the failure keep their new value.
func (r *Reader) parseHeader() error {
	magic, err := r.readUint32At(0)
	if err != nil {
		return errors.WithStack(err)
	}

	if magic != zimFormatMagicNumber {
		return errors.Errorf("invalid zim magic number '%d'", magic)
	}

	// Each helper returns a step that reads one little-endian value at offset
	// and stores it in dst only when the read succeeded.
	u16 := func(offset int64, dst *uint16) func() error {
		return func() error {
			value, err := r.readUint16At(offset)
			if err != nil {
				return errors.WithStack(err)
			}

			*dst = value

			return nil
		}
	}

	u32 := func(offset int64, dst *uint32) func() error {
		return func() error {
			value, err := r.readUint32At(offset)
			if err != nil {
				return errors.WithStack(err)
			}

			*dst = value

			return nil
		}
	}

	u64 := func(offset int64, dst *uint64) func() error {
		return func() error {
			value, err := r.readUint64At(offset)
			if err != nil {
				return errors.WithStack(err)
			}

			*dst = value

			return nil
		}
	}

	steps := []func() error{
		u16(4, &r.majorVersion),
		u16(6, &r.minorVersion),
		func() error {
			if err := r.parseUUID(); err != nil {
				return errors.WithStack(err)
			}

			return nil
		},
		u32(24, &r.entryCount),
		u32(28, &r.clusterCount),
		u64(32, &r.urlPtrPos),
		u64(40, &r.titlePtrPos),
		u64(48, &r.clusterPtrPos),
		u64(56, &r.mimeListPos),
		u32(64, &r.mainPage),
		u32(68, &r.layoutPage),
		u64(72, &r.checksumPos),
	}

	for _, step := range steps {
		if err := step(); err != nil {
			return err
		}
	}

	return nil
}
// parseUUID reads the 16 identifier bytes stored at header offset 8 and stores
// them in r.uuid as lowercase hexadecimal groups of 8-4-4-4-12 digits
// separated by dashes. Bytes are rendered in file order.
func (r *Reader) parseUUID() error {
	data := make([]byte, 16)
	if err := r.readRange(8, data); err != nil {
		return errors.WithStack(err)
	}

	parts := make([]string, 0, 5)

	val32, err := readUint32(data[0:4], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%08x", val32))

	val16, err := readUint16(data[4:6], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%04x", val16))

	val16, err = readUint16(data[6:8], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%04x", val16))

	val16, err = readUint16(data[8:10], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%04x", val16))

	val32, err = readUint32(data[10:14], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	val16, err = readUint16(data[14:16], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	// Zero-pad both halves: the unpadded "%x%x" dropped leading zeros, which
	// shortened the last group and made distinct identifiers collide.
	parts = append(parts, fmt.Sprintf("%08x%04x", val32, val16))

	r.uuid = strings.Join(parts, "-")

	return nil
}
// parseMimeTypes reads the list of mime type strings starting at mimeListPos
// and stores it in r.mimeTypes. The list ends at the first empty string.
func (r *Reader) parseMimeTypes() error {
	list := make([]string, 0)

	for cursor := int64(r.mimeListPos); ; {
		value, length, err := r.readStringAt(cursor)
		if err != nil {
			return errors.WithStack(err)
		}

		if value == "" {
			break
		}

		list = append(list, value)
		// Move past the string and one extra byte (presumably its terminator;
		// confirm against readStringAt).
		cursor += length + 1
	}

	r.mimeTypes = list

	return nil
}
// parseURLIndex loads the entry pointer list located at urlPtrPos into
// r.urlIndex.
func (r *Reader) parseURLIndex() error {
	pointers, err := r.parseEntryIndex(int64(r.urlPtrPos))
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlIndex = pointers

	return nil
}
// parseEntryAt parses the entry stored at offset. A mime type index equal to
// zimRedirect yields a *RedirectEntry, any other value a *ContentEntry.
func (r *Reader) parseEntryAt(offset int64) (Entry, error) {
	base, err := r.parseBaseEntry(offset)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	if base.mimeTypeIndex == zimRedirect {
		redirect, err := r.parseRedirectEntry(offset, base)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return redirect, nil
	}

	content, err := r.parseContentEntry(offset, base)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	return content, nil
}
// parseEntryIndex reads entryCount consecutive little-endian 64-bit pointers
// starting at startAddr and returns them in file order.
func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) {
	pointers := make([]uint64, r.entryCount)
	buffer := make([]byte, 8)

	for position := range pointers {
		address := startAddr + int64(position)*8

		if err := r.readRange(address, buffer); err != nil {
			return nil, errors.WithStack(err)
		}

		pointer, err := readUint64(buffer, binary.LittleEndian)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		pointers[position] = pointer
	}

	return pointers, nil
}
// getClusterOffsets returns the start and end offsets of the cluster at
// clusterIndex.
//
// The start offset is the pointer stored at clusterIndex in the cluster
// pointer list; the end offset is the next pointer in the list minus one.
func (r *Reader) getClusterOffsets(clusterIndex int) (uint64, uint64, error) {
	// readPtr reads the 8-byte little-endian pointer at the given list index.
	readPtr := func(index uint64) (uint64, error) {
		buf := make([]byte, 8)
		if err := r.readRange(int64(r.clusterPtrPos+(index*8)), buf); err != nil {
			return 0, errors.WithStack(err)
		}

		return binary.LittleEndian.Uint64(buf), nil
	}

	start, err := readPtr(uint64(clusterIndex))
	if err != nil {
		return 0, 0, err
	}

	next, err := readPtr(uint64(clusterIndex + 1))
	if err != nil {
		return 0, 0, err
	}

	return start, next - 1, nil
}
// readRange fills v with the len(v) bytes found at offset in the underlying
// seeker.
//
// The seek and the read are performed under r.seekerLock so that concurrent
// callers cannot move the cursor between the two operations.
//
// It returns an error if seeking fails, if reading fails, or if the source
// ends before v could be filled entirely.
func (r *Reader) readRange(offset int64, v []byte) error {
	r.seekerLock.Lock()
	defer r.seekerLock.Unlock()

	if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil {
		return errors.WithStack(err)
	}

	// A single Read call is allowed to return fewer bytes than requested
	// without reporting an error, so keep reading until v is full.
	if _, err := io.ReadFull(r.seeker, v); err != nil {
		if errors.Is(err, io.ErrUnexpectedEOF) {
			return errors.New("could not read enough bytes")
		}

		return errors.WithStack(err)
	}

	return nil
}
// readUint32At reads the little-endian uint32 stored at offset.
func (r *Reader) readUint32At(offset int64) (uint32, error) {
	var buf [4]byte

	if err := r.readRange(offset, buf[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	return binary.LittleEndian.Uint32(buf[:]), nil
}
// readUint16At reads the little-endian uint16 stored at offset.
func (r *Reader) readUint16At(offset int64) (uint16, error) {
	var buf [2]byte

	if err := r.readRange(offset, buf[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	return binary.LittleEndian.Uint16(buf[:]), nil
}
// readUint64At reads the little-endian uint64 stored at offset.
func (r *Reader) readUint64At(offset int64) (uint64, error) {
	var buf [8]byte

	if err := r.readRange(offset, buf[:]); err != nil {
		return 0, errors.WithStack(err)
	}

	return binary.LittleEndian.Uint64(buf[:]), nil
}
// readStringAt reads bytes one at a time from offset until nullByte is met.
//
// It returns the decoded string (trailing "\x00" removed) and the number of
// bytes that precede the terminator; the terminator itself is not counted.
func (r *Reader) readStringAt(offset int64) (string, int64, error) {
	var (
		builder strings.Builder
		length  int64
	)

	current := make([]byte, 1)

	for ; ; length++ {
		if err := r.readRange(offset+length, current); err != nil {
			return "", length, errors.WithStack(err)
		}

		if err := builder.WriteByte(current[0]); err != nil {
			return "", length, errors.WithStack(err)
		}

		if current[0] != nullByte {
			continue
		}

		return strings.TrimRight(builder.String(), "\x00"), length, nil
	}
}
// NewReader builds a Reader on top of seeker and parses the archive
// immediately.
//
// The given option functions are applied through NewOptions; the resulting
// CacheSize sizes the entry LRU cache. An error is returned when the cache
// cannot be created or when parsing fails.
func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) {
	options := NewOptions(funcs...)

	entryCache, err := lru.New[string, Entry](options.CacheSize)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	r := &Reader{
		seeker: seeker,
		cache:  entryCache,
	}

	if parseErr := r.parse(); parseErr != nil {
		return nil, errors.WithStack(parseErr)
	}

	return r, nil
}
// Open opens the file at path read-only and returns a Reader for it.
//
// The returned Reader owns the file. If the Reader cannot be created, the
// file is closed before the error is returned so the descriptor does not
// leak.
func Open(path string, funcs ...OptionFunc) (*Reader, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader, err := NewReader(file, funcs...)
	if err != nil {
		// NewReader does not close the seeker on failure. The close error is
		// deliberately dropped: the NewReader error is the one that matters.
		_ = file.Close()

		return nil, errors.WithStack(err)
	}

	return reader, nil
}

View File

@ -0,0 +1,133 @@
package zim
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"github.com/pkg/errors"
"gitlab.com/wpetit/goweb/logger"
)
// readerTestCase describes the expected content of a ZIM fixture. It is
// decoded from the JSON file sitting next to the ".zim" archive in testdata.
type readerTestCase struct {
// UUID is compared with Reader.UUID().
UUID string `json:"uuid"`
// EntryCount is compared with Reader.EntryCount().
EntryCount uint32 `json:"entryCount"`
// Entries lists the entries to look up and the values expected for each.
Entries []struct {
// Namespace and URL are passed to Reader.EntryWithURL().
Namespace Namespace `json:"namespace"`
URL string `json:"url"`
// Size is compared with the size reported by the content reader.
Size int64 `json:"size"`
// Compression is compared with the value returned by Compression().
Compression int `json:"compression"`
// MimeType and Title are compared after following redirects.
MimeType string `json:"mimeType"`
Title string `json:"title"`
} `json:"entries"`
}
// TestReader opens every "testdata/*.zim" archive and compares what the
// Reader exposes with the expectations stored in the sibling JSON file.
func TestReader(t *testing.T) {
	if testing.Verbose() {
		logger.SetLevel(logger.LevelDebug)
		logger.SetFormat(logger.FormatHuman)
	}

	files, err := filepath.Glob("testdata/*.zim")
	if err != nil {
		t.Fatalf("%+v", errors.WithStack(err))
	}

	for _, zf := range files {
		testName := filepath.Base(zf)

		testCase, err := loadZimFileTestCase(zf)
		if err != nil {
			t.Fatalf("%+v", errors.WithStack(err))
		}

		t.Run(testName, func(t *testing.T) {
			reader, err := Open(zf)
			if err != nil {
				t.Fatalf("%+v", errors.WithStack(err))
			}

			defer func() {
				if err := reader.Close(); err != nil {
					t.Errorf("%+v", errors.WithStack(err))
				}
			}()

			if e, g := testCase.UUID, reader.UUID(); e != g {
				t.Errorf("reader.UUID(): expected '%s', got '%s'", e, g)
			}

			if e, g := testCase.EntryCount, reader.EntryCount(); e != g {
				t.Errorf("reader.EntryCount(): expected '%v', got '%v'", e, g)
			}

			for _, entryTestCase := range testCase.Entries {
				testName := fmt.Sprintf("Entry/%s/%s", entryTestCase.Namespace, entryTestCase.URL)

				t.Run(testName, func(t *testing.T) {
					entry, err := reader.EntryWithURL(entryTestCase.Namespace, entryTestCase.URL)
					if err != nil {
						t.Fatalf("%+v", errors.WithStack(err))
					}

					// content is dereferenced below, so stop here on failure
					// instead of panicking on a nil value.
					content, err := entry.Redirect()
					if err != nil {
						t.Fatalf("%+v", errors.WithStack(err))
					}

					if e, g := entryTestCase.MimeType, content.MimeType(); e != g {
						t.Errorf("content.MimeType(): expected '%v', got '%v'", e, g)
					}

					if e, g := entryTestCase.Title, content.Title(); e != g {
						t.Errorf("content.Title(): expected '%v', got '%v'", e, g)
					}

					compression, err := content.Compression()
					if err != nil {
						t.Errorf("%+v", errors.WithStack(err))
					}

					if e, g := entryTestCase.Compression, compression; e != g {
						t.Errorf("content.Compression(): expected '%v', got '%v'", e, g)
					}

					// Same reasoning: contentReader is used right after.
					contentReader, err := content.Reader()
					if err != nil {
						t.Fatalf("%+v", errors.WithStack(err))
					}

					defer func() {
						if err := contentReader.Close(); err != nil {
							t.Errorf("%+v", errors.WithStack(err))
						}
					}()

					size, err := contentReader.Size()
					if err != nil {
						t.Errorf("%+v", errors.WithStack(err))
					}

					if e, g := entryTestCase.Size, size; e != g {
						t.Errorf("content.Size(): expected '%v', got '%v'", e, g)
					}
				})
			}
		})
	}
}
// loadZimFileTestCase reads the expectations associated with zimFile.
//
// The expectations live in a JSON file whose path is zimFile with its ".zim"
// suffix (if any) removed and ".json" appended.
func loadZimFileTestCase(zimFile string) (*readerTestCase, error) {
	jsonPath := strings.TrimSuffix(zimFile, ".zim") + ".json"

	raw, err := os.ReadFile(jsonPath)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	var loaded readerTestCase
	if err := json.Unmarshal(raw, &loaded); err != nil {
		return nil, errors.WithStack(err)
	}

	return &loaded, nil
}

View File

@ -0,0 +1,14 @@
{
"uuid": "8d141c3b-115d-bf73-294a-ee3c2e6b97b0",
"entryCount": 6223,
"entries": [
{
"namespace": "C",
"url": "users_page=9",
"compression": 5,
"size": 58646,
"mimeType": "text/html",
"title": "users_page=9"
}
]
}

Binary file not shown.

22
pkg/bundle/zim/testdata/cadoles.json vendored Normal file
View File

@ -0,0 +1,22 @@
{
"uuid": "cf81f094-d802-c790-b854-c74ad9701ddb",
"entryCount": 271,
"entries": [
{
"namespace": "C",
"url": "blog/202206-ShowroomInnovation.jpg",
"compression": 1,
"size": 260260,
"mimeType": "image/jpeg",
"title": "blog/202206-ShowroomInnovation.jpg"
},
{
"namespace": "C",
"url": "team/index.html",
"compression": 5,
"size": 93185,
"mimeType": "text/html",
"title": "Cadoles - Notre équipe"
}
]
}

BIN
pkg/bundle/zim/testdata/cadoles.zim vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,14 @@
{
"uuid": "ad4f406c-2021-2db8-c729-297568bbe376",
"entryCount": 330,
"entries": [
{
"namespace": "M",
"url": "Illustration_48x48@1",
"compression": 5,
"size": 5365,
"mimeType": "text/plain",
"title": "Illustration_48x48@1"
}
]
}

View File

@ -0,0 +1,63 @@
package zim
import (
"io"
"github.com/pkg/errors"
)
// UncompressedBlobReader reads a blob directly from the archive, between two
// offsets, without any decompression step.
type UncompressedBlobReader struct {
	// reader gives access to the underlying archive bytes.
	reader *Reader
	// blobStartOffset and blobEndOffset delimit the blob; they are passed to
	// the archive reader as-is.
	blobStartOffset uint64
	blobEndOffset   uint64
	// blobSize is stored by the constructor; the methods of this type do not
	// read it.
	blobSize int
	// readOffset is the number of bytes already consumed by Read.
	readOffset uint64
}

// Size implements BlobReader.
//
// The size is the distance between the blob end and start offsets.
func (r *UncompressedBlobReader) Size() (int64, error) {
	span := r.blobEndOffset - r.blobStartOffset

	return int64(span), nil
}

// Close implements io.ReadCloser.
//
// There is nothing to release, so it always returns nil.
func (r *UncompressedBlobReader) Close() error {
	return nil
}
// Read implements io.ReadCloser.
//
// It copies up to len(p) bytes of the blob into p, starting where the
// previous call stopped. When the call consumes the last remaining bytes of
// the blob, io.EOF is returned together with the byte count. On failure no
// byte is reported as read and the read position is left unchanged.
func (r *UncompressedBlobReader) Read(p []byte) (n int, err error) {
	remaining := int(r.blobEndOffset - (r.blobStartOffset + r.readOffset))

	size := len(p)
	if size > remaining {
		size = remaining
	}

	// Read straight into the caller's buffer: no intermediate allocation.
	err = r.reader.readRange(int64(r.blobStartOffset+r.readOffset), p[:size])
	if err != nil {
		return 0, errors.WithStack(err)
	}

	r.readOffset += uint64(size)

	if size == remaining {
		return size, io.EOF
	}

	return size, nil
}
// NewUncompressedBlobReader returns a blob reader positioned at the start of
// the blob delimited by blobStartOffset and blobEndOffset.
func NewUncompressedBlobReader(reader *Reader, blobStartOffset, blobEndOffset uint64, blobSize int) *UncompressedBlobReader {
	blob := new(UncompressedBlobReader)

	blob.reader = reader
	blob.blobStartOffset = blobStartOffset
	blob.blobEndOffset = blobEndOffset
	blob.blobSize = blobSize
	blob.readOffset = 0

	return blob
}

// Compile-time check that UncompressedBlobReader satisfies BlobReader.
var _ BlobReader = (*UncompressedBlobReader)(nil)

52
pkg/bundle/zim/util.go Normal file
View File

@ -0,0 +1,52 @@
package zim
import (
"bytes"
"encoding/binary"
"github.com/pkg/errors"
)
// readUint64 decodes a uint64 from the beginning of b using the given byte
// order. It fails when b holds fewer than 8 bytes.
func readUint64(b []byte, order binary.ByteOrder) (uint64, error) {
	var value uint64

	if err := binary.Read(bytes.NewReader(b), order, &value); err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}
// readUint32 decodes a uint32 from the beginning of b using the given byte
// order. It fails when b holds fewer than 4 bytes.
func readUint32(b []byte, order binary.ByteOrder) (uint32, error) {
	var value uint32

	if err := binary.Read(bytes.NewReader(b), order, &value); err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}
// readUint16 decodes a uint16 from the beginning of b using the given byte
// order. It fails when b holds fewer than 2 bytes.
func readUint16(b []byte, order binary.ByteOrder) (uint16, error) {
	var value uint16

	if err := binary.Read(bytes.NewReader(b), order, &value); err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}
// readUint8 decodes a uint8 from the first byte of b. The order parameter is
// forwarded to binary.Read. It fails when b is empty.
func readUint8(b []byte, order binary.ByteOrder) (uint8, error) {
	var value uint8

	if err := binary.Read(bytes.NewReader(b), order, &value); err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

View File

@ -0,0 +1,42 @@
package zim
import (
"io"
"github.com/pkg/errors"
"github.com/ulikunitz/xz"
)
// XZBlobReader exposes an xz decoder as an io.ReadCloser.
type XZBlobReader struct {
	decoder *xz.Reader
}

// Close implements io.ReadCloser.
//
// It does nothing and always returns nil.
func (r *XZBlobReader) Close() error {
	return nil
}

// Read implements io.ReadCloser.
//
// The call is forwarded to the xz decoder.
func (r *XZBlobReader) Read(p []byte) (n int, err error) {
	n, err = r.decoder.Read(p)

	return n, err
}
// Compile-time check that XZBlobReader satisfies io.ReadCloser.
var _ io.ReadCloser = (*XZBlobReader)(nil)

// NewXZBlobReader returns a CompressedBlobReader that decodes its cluster
// with xz. All other arguments are forwarded to NewCompressedBlobReader
// unchanged.
func NewXZBlobReader(reader *Reader, clusterStartOffset, clusterEndOffset uint64, blobIndex uint32, blobSize int) *CompressedBlobReader {
	// newDecoder wraps the raw cluster stream in an xz decoder.
	newDecoder := func(src io.Reader) (io.ReadCloser, error) {
		xzReader, err := xz.NewReader(src)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return &XZBlobReader{decoder: xzReader}, nil
	}

	return NewCompressedBlobReader(
		reader,
		newDecoder,
		clusterStartOffset,
		clusterEndOffset,
		blobIndex,
		blobSize,
	)
}

View File

@ -1,150 +0,0 @@
package zim
import (
"log"
"testing"
"github.com/pkg/errors"
)
// Z is the archive reader shared by every test of this file.
var Z *ZimReader

// init opens the test archive once and panics if it cannot be read.
func init() {
	reader, err := NewReader("testdata/wikibooks_af_all_maxi_2023-06.zim")
	if err != nil {
		log.Panicf("Can't read %v", err)
	}

	Z = reader
}
// TestOpen checks that the shared archive reports at least one article.
func TestOpen(t *testing.T) {
	if count := Z.ArticleCount; count == 0 {
		t.Errorf("No article found")
	}
}
// TestMime checks that the shared archive exposes at least one MIME type.
func TestMime(t *testing.T) {
	mimeTypes := Z.MimeTypes()
	if len(mimeTypes) == 0 {
		t.Errorf("No mime types found")
	}
}
// TestDisplayInfost checks that the archive description is not empty and
// logs it.
func TestDisplayInfost(t *testing.T) {
	info := Z.String()
	// A length can never be negative: an empty description is the failure
	// case to detect.
	if len(info) == 0 {
		t.Errorf("Can't read infos")
	}

	t.Log(info)
}
// TestURLAtIdx checks that an article can be loaded from the offset stored
// at URL index 5.
func TestURLAtIdx(t *testing.T) {
	// addr 0 is a redirect
	offset, _ := Z.OffsetAtURLIdx(5)

	if article, _ := Z.ArticleAt(offset); article == nil {
		t.Errorf("Can't find 1st url")
	}
}
// TestDisplayArticle loads the article stored at URL index 5 and logs it.
func TestDisplayArticle(t *testing.T) {
	// addr 0 is a redirect
	offset, _ := Z.OffsetAtURLIdx(5)
	article, _ := Z.ArticleAt(offset)

	if article == nil {
		t.Errorf("Can't find 1st url")
	}

	t.Log(article)
}
// TestPageNoIndex checks that a known URL can be resolved with
// GetPageNoIndex.
func TestPageNoIndex(t *testing.T) {
	if article, _ := Z.GetPageNoIndex("A/Dracula:Capitol_1.html"); article == nil {
		t.Errorf("Can't find existing url")
	}
}
// TestListArticles iterates over every listed article, logs each of them and
// compares the number seen with ArticleCount-1. Skipped in short mode.
func TestListArticles(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var seen uint32

	for article := range Z.ListArticles() {
		seen++
		t.Log(article.String())
	}

	if seen == 0 {
		t.Errorf("Can't find any urls")
	}

	if expected := Z.ArticleCount - 1; seen != expected {
		t.Errorf("Can't find the exact ArticleCount urls %d vs %d", seen, Z.ArticleCount)
	}
}
// TestMainPage checks that the archive main page can be resolved and logs
// it.
func TestMainPage(t *testing.T) {
	page, _ := Z.MainPage()

	if page == nil {
		t.Errorf("Can't find the mainpage article")
	}

	t.Log(page)
}
// TestFavicon checks that the archive favicon can be resolved without
// error.
func TestFavicon(t *testing.T) {
	icon, faviconErr := Z.Favicon()

	if faviconErr != nil {
		t.Errorf("%+v", errors.WithStack(faviconErr))
	}

	if icon == nil {
		t.Errorf("Can't find the favicon article")
	}
}
// TestMetadata checks that the archive metadata can be read without error.
func TestMetadata(t *testing.T) {
	values, metadataErr := Z.Metadata()

	if metadataErr != nil {
		t.Errorf("%+v", errors.WithStack(metadataErr))
	}

	if values == nil {
		t.Errorf("Can't find the metadata")
	}
}
// TestData loads the article stored at URL index 2 and, unless it is a
// redirect, checks that its data is not empty.
func TestData(t *testing.T) {
	// addr 0 is a redirect
	p, _ := Z.OffsetAtURLIdx(2)
	a, _ := Z.ArticleAt(p)

	// The article is dereferenced below: stop here rather than panic.
	if a == nil {
		t.Fatal("can't find article")
	}

	b, _ := a.Data()
	data := string(b)

	if a.EntryType != RedirectEntry {
		if len(data) == 0 {
			t.Error("can't read data")
		}
	}

	t.Log(a.String())
	t.Log(data)
}
// BenchmarkArticleBytes measures the cost of reading the data of the article
// stored at URL index 5, purging the blob cache between iterations.
func BenchmarkArticleBytes(b *testing.B) {
	// addr 0 is a redirect
	p, _ := Z.OffsetAtURLIdx(5)
	a, _ := Z.ArticleAt(p)

	// The article is dereferenced below: stop here rather than panic.
	if a == nil {
		b.Fatalf("Can't find 1st url")
	}

	data, err := a.Data()
	if err != nil {
		b.Fatal(err)
	}

	b.SetBytes(int64(len(data)))
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// A failing read would make the measurement meaningless.
		if _, err := a.Data(); err != nil {
			b.Fatal(err)
		}

		bcache.Purge() // prevent memoizing value
	}
}

View File

@ -0,0 +1,43 @@
package zim
import (
"io"
"github.com/klauspost/compress/zstd"
"github.com/pkg/errors"
)
// ZstdBlobReader exposes a zstd decoder as an io.ReadCloser.
type ZstdBlobReader struct {
	decoder *zstd.Decoder
}

// Close implements io.ReadCloser.
//
// It closes the zstd decoder and always returns nil.
func (r *ZstdBlobReader) Close() error {
	r.decoder.Close()

	return nil
}

// Read implements io.ReadCloser.
//
// The call is forwarded to the zstd decoder.
func (r *ZstdBlobReader) Read(p []byte) (n int, err error) {
	n, err = r.decoder.Read(p)

	return n, err
}
// Compile-time check that ZstdBlobReader satisfies io.ReadCloser.
var _ io.ReadCloser = (*ZstdBlobReader)(nil)

// NewZStdBlobReader returns a CompressedBlobReader that decodes its cluster
// with zstd. All other arguments are forwarded to NewCompressedBlobReader
// unchanged.
func NewZStdBlobReader(reader *Reader, clusterStartOffset, clusterEndOffset uint64, blobIndex uint32, blobSize int) *CompressedBlobReader {
	// newDecoder wraps the raw cluster stream in a zstd decoder.
	newDecoder := func(src io.Reader) (io.ReadCloser, error) {
		zstdDecoder, err := zstd.NewReader(src)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return &ZstdBlobReader{decoder: zstdDecoder}, nil
	}

	return NewCompressedBlobReader(
		reader,
		newDecoder,
		clusterStartOffset,
		clusterEndOffset,
		blobIndex,
		blobSize,
	)
}

View File

@ -3,19 +3,18 @@ package bundle
import (
"bytes"
"context"
"fmt"
"io"
"io/fs"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"golang.org/x/net/html"
"forge.cadoles.com/arcad/edge/pkg/bundle/zim"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/pkg/errors"
"gitlab.com/wpetit/goweb/logger"
"gopkg.in/yaml.v2"
@ -23,6 +22,12 @@ import (
type ZimBundle struct {
archivePath string
initOnce sync.Once
initErr error
reader *zim.Reader
urlNamespaceCache *lru.Cache[string, zim.Namespace]
}
func (b *ZimBundle) File(filename string) (io.ReadCloser, os.FileInfo, error) {
@ -41,7 +46,7 @@ func (b *ZimBundle) File(filename string) (io.ReadCloser, os.FileInfo, error) {
case "public":
return b.renderDirectory(ctx, filename)
case "public/index.html":
return b.redirectToMainPage(ctx, filename)
return b.renderMainPage(ctx, filename)
default:
return b.renderURL(ctx, filename)
@ -49,58 +54,16 @@ func (b *ZimBundle) File(filename string) (io.ReadCloser, os.FileInfo, error) {
}
func (b *ZimBundle) Dir(dirname string) ([]os.FileInfo, error) {
reader, err := b.openArchive()
if err != nil {
return nil, err
}
defer func() {
if err := reader.Close(); err != nil {
panic(errors.WithStack(err))
}
}()
files := make([]os.FileInfo, 0)
// ctx := context.Background()
// for _, f := range reader.File {
// if !strings.HasPrefix(f.Name, dirname) {
// continue
// }
// relPath, err := filepath.Rel(dirname, f.Name)
// if err != nil {
// return nil, errors.Wrap(err, "could not get relative path")
// }
// logger.Debug(
// ctx, "checking file prefix",
// logger.F("dirname", dirname),
// logger.F("filename", f.Name),
// logger.F("relpath", relPath),
// )
// if relPath == filepath.Base(f.Name) {
// files = append(files, f.FileInfo())
// }
// }
return files, nil
}
func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.FileInfo, error) {
reader, err := b.openArchive()
if err != nil {
if err := b.init(); err != nil {
return nil, nil, errors.WithStack(err)
}
defer func() {
if err := reader.Close(); err != nil {
panic(errors.WithStack(err))
}
}()
metadata, err := reader.Metadata()
metadata, err := b.reader.Metadata()
if err != nil {
return nil, nil, errors.WithStack(err)
}
@ -117,7 +80,7 @@ func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.F
manifest["id"] = strings.ToLower(replacer.Replace(name)) + ".zim.edge.app"
} else {
manifest["id"] = strconv.FormatUint(uint64(reader.UUID), 10) + ".zim.edge.app"
manifest["id"] = b.reader.UUID() + ".zim.edge.app"
}
if title, exists := metadata[zim.MetadataTitle]; exists {
@ -130,7 +93,7 @@ func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.F
manifest["description"] = description
}
favicon, err := reader.Favicon()
favicon, err := b.reader.Favicon()
if err != nil && !errors.Is(err, zim.ErrNotFound) {
return nil, nil, errors.WithStack(err)
}
@ -165,7 +128,7 @@ func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.F
}
buf := bytes.NewBuffer(data)
file := ioutil.NopCloser(buf)
file := io.NopCloser(buf)
return file, stat, nil
}
@ -180,62 +143,77 @@ func (b *ZimBundle) renderFakeServerMain(ctx context.Context) (io.ReadCloser, os
}
buf := bytes.NewBuffer(nil)
file := ioutil.NopCloser(buf)
file := io.NopCloser(buf)
return file, stat, nil
}
func (b *ZimBundle) renderURL(ctx context.Context, url string) (io.ReadCloser, os.FileInfo, error) {
zr, err := b.openArchive()
if err != nil {
if err := b.init(); err != nil {
return nil, nil, errors.WithStack(err)
}
defer func() {
if err := zr.Close(); err != nil {
panic(errors.WithStack(err))
}
}()
filename := filepath.Base(url)
url = strings.TrimPrefix(url, "public/")
article, err := zr.GetPageNoIndex(url)
entry, err := b.searchEntryFromURL(ctx, url)
if err != nil {
if errors.Is(err, zim.ErrNotFound) {
return nil, nil, errors.WithStack(fs.ErrNotExist)
return nil, nil, os.ErrNotExist
}
return nil, nil, errors.WithStack(err)
}
if article.EntryType == zim.RedirectEntry {
redirectIndex, err := article.RedirectIndex()
if err != nil {
return nil, nil, errors.WithStack(err)
}
logger.Debug(
ctx, "found zim entry",
logger.F("webURL", url),
logger.F("zimFullURL", entry.FullURL()),
)
ra, err := zr.ArticleAtURLIdx(redirectIndex)
if err != nil {
return nil, nil, errors.WithStack(err)
}
return b.renderRedirect(ctx, filename, ra.FullURL())
}
data, err := article.Data()
content, err := entry.Redirect()
if err != nil {
return nil, nil, errors.WithStack(err)
}
mimeType := article.MimeType()
if mimeType == "text/html" {
injected, err := b.injectEdgeScriptTag(data)
if err != nil {
logger.Error(ctx, "could not inject edge script", logger.E(errors.WithStack(err)))
} else {
data = injected
contentReader, err := content.Reader()
if err != nil {
return nil, nil, errors.WithStack(err)
}
size, err := contentReader.Size()
if err != nil {
return nil, nil, errors.WithStack(err)
}
mimeType := content.MimeType()
if mimeType != "text/html" {
zimFile := &zimFile{
fileInfo: &zimFileInfo{
isDir: false,
modTime: time.Time{},
mode: 0,
name: filename,
size: size,
},
reader: contentReader,
}
return zimFile, zimFile.fileInfo, nil
}
// Read HTML file and inject Edge scripts
data, err := io.ReadAll(contentReader)
if err != nil {
return nil, nil, err
}
injected, err := b.injectEdgeScriptTag(data)
if err != nil {
logger.Error(ctx, "could not inject edge script", logger.E(errors.WithStack(err)))
} else {
data = injected
}
zimFile := &zimFile{
@ -244,26 +222,114 @@ func (b *ZimBundle) renderURL(ctx context.Context, url string) (io.ReadCloser, o
modTime: time.Time{},
mode: 0,
name: filename,
size: int64(len(data)),
size: size,
},
buff: bytes.NewBuffer(data),
reader: io.NopCloser(bytes.NewBuffer(data)),
}
return zimFile, zimFile.fileInfo, nil
}
func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) {
zr, err := b.openArchive()
if err != nil {
return nil, nil, errors.WithStack(err)
// searchEntryFromURL resolves a web URL to an entry of the ZIM archive.
//
// The lookup goes through the following steps and stops at the first match:
//
//  1. the namespace cache, keyed by URL;
//  2. a direct access when the URL looks like <NS>/<URL>, <NS> being a
//     single character;
//  3. the same URL tried against a fixed list of content namespaces;
//  4. a full scan of the archive entries, comparing both the full URL and
//     the namespace-less URL of each entry.
//
// Successful lookups in steps 2 to 4 record the entry namespace in the
// cache. zim.ErrNotFound is returned when nothing matches.
func (b *ZimBundle) searchEntryFromURL(ctx context.Context, url string) (zim.Entry, error) {
	ctx = logger.With(ctx, logger.F("webURL", url))

	logger.Debug(ctx, "searching entry namespace in local cache")

	// Search URL namespace from cache
	if namespace, found := b.urlNamespaceCache.Get(url); found {
		logger.Debug(ctx, "found entry namespace in cache")

		entry, err := b.reader.EntryWithURL(namespace, url)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return entry, nil
	}

	// Try to access entry directly if the URL match the pattern <NS>/<URL>
	urlParts := strings.SplitN(url, "/", 2)
	if len(urlParts) == 2 && len(urlParts[0]) == 1 {
		namespace := zim.Namespace(urlParts[0])
		// Keep the stripped URL in its own variable: when this attempt fails,
		// the following steps must still search for the URL that was
		// requested, not for its tail.
		directURL := urlParts[1]

		logger.Debug(
			ctx, "trying to access entry directly",
			logger.F("zimNamespace", namespace),
			logger.F("zimURL", directURL),
		)

		entry, err := b.reader.EntryWithURL(namespace, directURL)
		if err != nil && !errors.Is(err, zim.ErrNotFound) {
			return nil, errors.WithStack(err)
		}

		if entry != nil {
			b.urlNamespaceCache.Add(directURL, entry.Namespace())
			return entry, nil
		}
	}

	contentNamespaces := []zim.Namespace{
		zim.V6NamespaceContent,
		zim.V6NamespaceMetadata,
		zim.V5NamespaceLayout,
		zim.V5NamespaceArticle,
		zim.V5NamespaceImageFile,
		zim.V5NamespaceMetadata,
	}

	logger.Debug(
		ctx, "make educated guesses about potential url namespace",
		logger.F("zimNamespaces", contentNamespaces),
	)

	for _, ns := range contentNamespaces {
		logger.Debug(
			ctx, "trying to access entry directly",
			logger.F("zimNamespace", ns),
			logger.F("zimURL", url),
		)

		entry, err := b.reader.EntryWithURL(ns, url)
		if err != nil && !errors.Is(err, zim.ErrNotFound) {
			return nil, errors.WithStack(err)
		}

		if entry != nil {
			b.urlNamespaceCache.Add(url, entry.Namespace())
			return entry, nil
		}
	}

	logger.Debug(ctx, "doing full entries scan")

	var entry zim.Entry

	iterator := b.reader.Entries()
	for iterator.Next() {
		current := iterator.Entry()
		if current.FullURL() != url && current.URL() != url {
			continue
		}

		entry = current
		b.urlNamespaceCache.Add(url, entry.Namespace())

		break
	}

	if err := iterator.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	if entry == nil {
		return nil, errors.WithStack(zim.ErrNotFound)
	}

	return entry, nil
}
func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) {
zimFile := &zimFile{
fileInfo: &zimFileInfo{
isDir: true,
@ -272,55 +338,23 @@ func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.Re
name: filename,
size: 0,
},
buff: bytes.NewBuffer(nil),
reader: io.NopCloser(bytes.NewBuffer(nil)),
}
return zimFile, zimFile.fileInfo, nil
}
func (b *ZimBundle) renderRedirect(ctx context.Context, filename string, to string) (io.ReadCloser, os.FileInfo, error) {
logger.Debug(ctx, "rendering redirect", logger.F("url", to))
data := fmt.Sprintf(`
<html>
<head>
<meta http-equiv="refresh" content="0; url=/%s" />
</head>
</html>
`, to)
stat := &zimFileInfo{
isDir: false,
modTime: time.Time{},
mode: 0,
name: filename,
size: int64(len(data)),
func (b *ZimBundle) renderMainPage(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) {
if err := b.init(); err != nil {
return nil, nil, errors.WithStack(err)
}
buf := bytes.NewBuffer([]byte(data))
reader := ioutil.NopCloser(buf)
return reader, stat, nil
}
func (b *ZimBundle) redirectToMainPage(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) {
zr, err := b.openArchive()
main, err := b.reader.MainPage()
if err != nil {
return nil, nil, errors.WithStack(err)
}
defer func() {
if err := zr.Close(); err != nil {
panic(errors.WithStack(err))
}
}()
main, err := zr.MainPage()
if err != nil {
return nil, nil, errors.WithStack(err)
}
return b.renderRedirect(ctx, filename, main.FullURL())
return b.renderURL(ctx, main.FullURL())
}
func (b *ZimBundle) injectEdgeScriptTag(data []byte) ([]byte, error) {
@ -369,13 +403,29 @@ func (b *ZimBundle) injectEdgeScriptTag(data []byte) ([]byte, error) {
return buff.Bytes(), nil
}
func (b *ZimBundle) openArchive() (*zim.ZimReader, error) {
zm, err := zim.NewReader(b.archivePath)
if err != nil {
return nil, errors.Wrapf(err, "could not open '%v'", b.archivePath)
func (b *ZimBundle) init() error {
b.initOnce.Do(func() {
reader, err := zim.Open(b.archivePath)
if err != nil {
b.initErr = errors.Wrapf(err, "could not open '%v'", b.archivePath)
return
}
b.reader = reader
cache, err := lru.New[string, zim.Namespace](128)
if err != nil {
b.initErr = errors.Wrap(err, "could not initialize cache")
return
}
b.urlNamespaceCache = cache
})
if b.initErr != nil {
return errors.WithStack(b.initErr)
}
return zm, nil
return nil
}
func NewZimBundle(archivePath string) *ZimBundle {
@ -386,17 +436,30 @@ func NewZimBundle(archivePath string) *ZimBundle {
type zimFile struct {
fileInfo *zimFileInfo
buff *bytes.Buffer
reader io.ReadCloser
}
// Close implements fs.File.
//
// It closes the wrapped reader and returns its error, if any, with a stack
// trace attached.
func (f *zimFile) Close() error {
	err := f.reader.Close()
	if err == nil {
		return nil
	}

	return errors.WithStack(err)
}
// Read implements fs.File.
func (f *zimFile) Read(d []byte) (int, error) {
return f.buff.Read(d)
n, err := f.reader.Read(d)
if err != nil {
if errors.Is(err, io.EOF) {
return n, err
}
return n, errors.WithStack(err)
}
return n, nil
}
// Stat implements fs.File.