diff --git a/go.mod b/go.mod index 8bd6f31..95b025a 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module forge.cadoles.com/arcad/edge -go 1.20 +go 1.21 require ( github.com/hashicorp/golang-lru/v2 v2.0.7 diff --git a/modd.conf b/modd.conf index 877a0c9..25a1b13 100644 --- a/modd.conf +++ b/modd.conf @@ -17,5 +17,5 @@ misc/client-sdk-testsuite/src/**/* } **/*.go { - prep: make GOTEST_ARGS="-short" test + # prep: make GOTEST_ARGS="-short" test } \ No newline at end of file diff --git a/pkg/app/option.go b/pkg/app/option.go new file mode 100644 index 0000000..8a8e8b1 --- /dev/null +++ b/pkg/app/option.go @@ -0,0 +1,36 @@ +package app + +import ( + "context" + + "github.com/pkg/errors" + "gitlab.com/wpetit/goweb/logger" +) + +type Options struct { + ModuleFactories []ServerModuleFactory + ErrorHandler func(ctx context.Context, err error) +} + +type OptionFunc func(opts *Options) + +func NewOptions(funcs ...OptionFunc) *Options { + opts := &Options{ + ModuleFactories: make([]ServerModuleFactory, 0), + ErrorHandler: func(ctx context.Context, err error) { + logger.Error(ctx, err.Error(), logger.E(errors.WithStack(err))) + }, + } + + for _, fn := range funcs { + fn(opts) + } + + return opts +} + +func WithModulesFactories(factories ...ServerModuleFactory) OptionFunc { + return func(opts *Options) { + opts.ModuleFactories = factories + } +} diff --git a/pkg/bundle/filesystem.go b/pkg/bundle/filesystem.go index bb3d8d7..3df6f81 100644 --- a/pkg/bundle/filesystem.go +++ b/pkg/bundle/filesystem.go @@ -3,7 +3,7 @@ package bundle import ( "bytes" "context" - "io/ioutil" + "io" "net/http" "os" "path" @@ -60,7 +60,7 @@ func (fs *FileSystem) Open(name string) (http.File, error) { file.files = files } else { - data, err := ioutil.ReadAll(readCloser) + data, err := io.ReadAll(readCloser) if err != nil { logger.Error(ctx, "could not read bundle file", logger.E(err)) diff --git a/pkg/bundle/zim/article.go b/pkg/bundle/zim/article.go deleted file mode 100644 index 303665a..0000000 
--- a/pkg/bundle/zim/article.go +++ /dev/null @@ -1,283 +0,0 @@ -package zim - -import ( - "bytes" - "errors" - "fmt" - "io" - "io/ioutil" - "strings" - "sync" - - lru "github.com/hashicorp/golang-lru/v2" -) - -const ( - RedirectEntry uint16 = 0xffff - LinkTargetEntry = 0xfffe - DeletedEntry = 0xfffd -) - -var articlePool sync.Pool - -// the recent uncompressed blobs, mainly useful while indexing and asking -// for the same blob again and again -var bcache *lru.Cache[any, any] - -type Article struct { - // EntryType is a RedirectEntry/LinkTargetEntry/DeletedEntry or an idx - // pointing to ZimReader.mimeTypeList - EntryType uint16 - Title string - URLPtr uint64 - Namespace byte - url string - blob uint32 - cluster uint32 - z *ZimReader -} - -// convenient method to return the Article at URL index idx -func (z *ZimReader) ArticleAtURLIdx(idx uint32) (*Article, error) { - o, err := z.OffsetAtURLIdx(idx) - if err != nil { - return nil, err - } - return z.ArticleAt(o) -} - -// return the article main page if it exists -func (z *ZimReader) MainPage() (*Article, error) { - if z.mainPage == 0xffffffff { - return nil, nil - } - return z.ArticleAtURLIdx(z.mainPage) -} - -// get the article (Directory) pointed by the offset found in URLpos or Titlepos -func (z *ZimReader) ArticleAt(offset uint64) (*Article, error) { - a := articlePool.Get().(*Article) - err := z.FillArticleAt(a, offset) - return a, err -} - -// Fill an article with datas found at offset -func (z *ZimReader) FillArticleAt(a *Article, offset uint64) error { - a.z = z - a.URLPtr = offset - - mimeIdx, err := readInt16(z.bytesRangeAt(offset, offset+2)) - if err != nil { - return fmt.Errorf("can't read article %w", err) - } - a.EntryType = mimeIdx - - // Linktarget or Target Entry - if mimeIdx == LinkTargetEntry || mimeIdx == DeletedEntry { - // TODO - return nil - } - - s, err := z.bytesRangeAt(offset+3, offset+4) - if err != nil { - return err - } - a.Namespace = s[0] - - a.cluster, err = 
readInt32(z.bytesRangeAt(offset+8, offset+8+4)) - if err != nil { - return err - } - a.blob, err = readInt32(z.bytesRangeAt(offset+12, offset+12+4)) - if err != nil { - return err - } - - // Redirect - if mimeIdx == RedirectEntry { - // assume the url + title won't be longer than 2k - b, err := z.bytesRangeAt(offset+12, offset+12+2048) - if err != nil { - return nil - } - bbuf := bytes.NewBuffer(b) - a.url, err = bbuf.ReadString('\x00') - if err != nil { - return err - } - a.url = strings.TrimRight(a.url, "\x00") - - a.Title, err = bbuf.ReadString('\x00') - if err != nil { - return err - } - a.Title = strings.TrimRight(a.Title, "\x00") - return err - } - - b, err := z.bytesRangeAt(offset+16, offset+16+2048) - if err != nil { - return nil - } - bbuf := bytes.NewBuffer(b) - a.url, err = bbuf.ReadString('\x00') - if err != nil { - return err - } - - a.url = strings.TrimRight(string(a.url), "\x00") - - title, err := bbuf.ReadString('\x00') - if err != nil { - return err - } - title = strings.TrimRight(string(title), "\x00") - // This is a trick to force a copy and avoid retain of the full buffer - // mainly for indexing title reasons - if len(title) != 0 { - a.Title = title[0:1] + title[1:] - } - return nil -} - -// return the uncompressed data associated with this article -func (a *Article) Data() ([]byte, error) { - // ensure we have data to read - if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry { - return nil, nil - } - start, end, err := a.z.clusterOffsetsAtIdx(a.cluster) - if err != nil { - return nil, err - } - s, err := a.z.bytesRangeAt(start, start+1) - if err != nil { - return nil, err - } - compression := uint8(s[0]) - - // blob starts at offset, blob ends at offset - var bs, be uint32 - - // LZMA: 4, Zstandard: 5 - if compression == 4 || compression == 5 { - blobLookup := func() ([]byte, bool) { - if v, ok := bcache.Get(a.cluster); ok { - b := v.([]byte) - return b, ok - } - return nil, false - } - - var blob 
[]byte - var ok bool - var dec io.ReadCloser - if blob, ok = blobLookup(); !ok { - b, err := a.z.bytesRangeAt(start+1, end+1) - if err != nil { - return nil, err - } - bbuf := bytes.NewBuffer(b) - switch compression { - case 5: - dec, err = NewZstdReader(bbuf) - - case 4: - dec, err = NewXZReader(bbuf) - } - if err != nil { - return nil, err - } - defer dec.Close() - // the decoded chunk are around 1MB - b, err = ioutil.ReadAll(dec) - if err != nil { - return nil, err - } - blob = make([]byte, len(b)) - copy(blob, b) - // TODO: 2 requests for the same blob could occure at the same time - bcache.Add(a.cluster, blob) - } else { - bi, ok := bcache.Get(a.cluster) - if !ok { - return nil, errors.New("not in cache anymore") - } - blob = bi.([]byte) - } - - bs, err = readInt32(blob[a.blob*4:a.blob*4+4], nil) - if err != nil { - return nil, err - } - be, err = readInt32(blob[a.blob*4+4:a.blob*4+4+4], nil) - if err != nil { - return nil, err - } - - // avoid retaining all the chunk - c := make([]byte, be-bs) - copy(c, blob[bs:be]) - return c, nil - - } else if compression == 0 || compression == 1 { - // uncompresssed - startPos := start + 1 - blobOffset := uint64(a.blob * 4) - - bs, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset, startPos+blobOffset+4)) - if err != nil { - return nil, err - } - - be, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset+4, startPos+blobOffset+4+4)) - if err != nil { - return nil, err - } - - return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be)) - } - - return nil, errors.New("Unhandled compression") -} - -func (a *Article) MimeType() string { - if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry { - return "" - } - - return a.z.mimeTypeList[a.EntryType] -} - -// return the url prefixed by the namespace -func (a *Article) FullURL() string { - return string(a.Namespace) + "/" + a.url -} - -func (a *Article) String() string { - return fmt.Sprintf("Mime: 0x%x URL: [%s], Title: 
[%s], Cluster: 0x%x Blob: 0x%x", - a.EntryType, a.FullURL(), a.Title, a.cluster, a.blob) -} - -// RedirectIndex return the redirect index of RedirectEntry type article -// return an err if not a redirect entry -func (a *Article) RedirectIndex() (uint32, error) { - if a.EntryType != RedirectEntry { - return 0, errors.New("Not a RedirectEntry") - } - // We use the cluster to save the redirect index position for RedirectEntry type - return a.cluster, nil -} - -func (a *Article) blobOffsetsAtIdx(z *ZimReader) (start, end uint64) { - idx := a.blob - offset := z.clusterPtrPos + uint64(idx)*8 - start, err := readInt64(z.bytesRangeAt(offset, offset+8)) - if err != nil { - return - } - offset = z.clusterPtrPos + uint64(idx+1)*8 - end, _ = readInt64(z.bytesRangeAt(offset, offset+8)) - - return -} diff --git a/pkg/bundle/zim/blob_reader.go b/pkg/bundle/zim/blob_reader.go new file mode 100644 index 0000000..452b3c1 --- /dev/null +++ b/pkg/bundle/zim/blob_reader.go @@ -0,0 +1,8 @@ +package zim + +import "io" + +type BlobReader interface { + io.ReadCloser + Size() (int64, error) +} diff --git a/pkg/bundle/zim/compressed_blob_reader.go b/pkg/bundle/zim/compressed_blob_reader.go new file mode 100644 index 0000000..0695b12 --- /dev/null +++ b/pkg/bundle/zim/compressed_blob_reader.go @@ -0,0 +1,163 @@ +package zim + +import ( + "bytes" + "encoding/binary" + "io" + "os" + "sync" + + "github.com/pkg/errors" +) + +type CompressedBlobReader struct { + reader *Reader + decoderFactory BlobDecoderFactory + + clusterStartOffset uint64 + clusterEndOffset uint64 + blobIndex uint32 + blobSize int + readOffset uint64 + + loadCluster sync.Once + loadClusterErr error + + data []byte + closed bool +} + +// Size implements BlobReader. +func (r *CompressedBlobReader) Size() (int64, error) { + if err := r.loadClusterData(); err != nil { + return 0, errors.WithStack(err) + } + + return int64(len(r.data)), nil +} + +// Close implements io.ReadCloser. 
+func (r *CompressedBlobReader) Close() error { + clear(r.data) + r.closed = true + return nil +} + +// Read implements io.ReadCloser. +func (r *CompressedBlobReader) Read(p []byte) (int, error) { + if err := r.loadClusterData(); err != nil { + return 0, errors.WithStack(err) + } + + length := len(p) + remaining := len(r.data) - int(r.readOffset) + if length > remaining { + length = remaining + } + + chunk := make([]byte, length) + + copy(chunk, r.data[r.readOffset:int(r.readOffset)+length]) + copy(p, chunk) + + r.readOffset += uint64(length) + + if length == remaining { + return length, io.EOF + } + + return length, nil +} + +func (r *CompressedBlobReader) loadClusterData() error { + if r.closed { + return errors.WithStack(os.ErrClosed) + } + + r.loadCluster.Do(func() { + compressedData := make([]byte, r.clusterEndOffset-r.clusterStartOffset) + if err := r.reader.readRange(int64(r.clusterStartOffset+1), compressedData); err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + blobBuffer := bytes.NewBuffer(compressedData) + + decoder, err := r.decoderFactory(blobBuffer) + if err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + defer decoder.Close() + + uncompressedData, err := io.ReadAll(decoder) + if err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + var ( + blobStart uint64 + blobEnd uint64 + ) + + if r.blobSize == 8 { + blobStart64, err := readUint64(uncompressedData[r.blobIndex*uint32(r.blobSize):r.blobIndex*uint32(r.blobSize)+uint32(r.blobSize)], binary.LittleEndian) + if err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + blobStart = blobStart64 + + blobEnd64, err := readUint64(uncompressedData[r.blobIndex*uint32(r.blobSize)+uint32(r.blobSize):r.blobIndex*uint32(r.blobSize)+uint32(r.blobSize)+uint32(r.blobSize)], binary.LittleEndian) + if err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + blobEnd = blobEnd64 + } else { + blobStart32, err := 
readUint32(uncompressedData[r.blobIndex*uint32(r.blobSize):r.blobIndex*uint32(r.blobSize)+uint32(r.blobSize)], binary.LittleEndian) + if err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + blobStart = uint64(blobStart32) + + blobEnd32, err := readUint32(uncompressedData[r.blobIndex*uint32(r.blobSize)+uint32(r.blobSize):r.blobIndex*uint32(r.blobSize)+uint32(r.blobSize)+uint32(r.blobSize)], binary.LittleEndian) + if err != nil { + r.loadClusterErr = errors.WithStack(err) + return + } + + blobEnd = uint64(blobEnd32) + } + + r.data = make([]byte, blobEnd-blobStart) + copy(r.data, uncompressedData[blobStart:blobEnd]) + }) + if r.loadClusterErr != nil { + return errors.WithStack(r.loadClusterErr) + } + + return nil +} + +type BlobDecoderFactory func(io.Reader) (io.ReadCloser, error) + +func NewCompressedBlobReader(reader *Reader, decoderFactory BlobDecoderFactory, clusterStartOffset, clusterEndOffset uint64, blobIndex uint32, blobSize int) *CompressedBlobReader { + return &CompressedBlobReader{ + reader: reader, + decoderFactory: decoderFactory, + clusterStartOffset: clusterStartOffset, + clusterEndOffset: clusterEndOffset, + blobIndex: blobIndex, + blobSize: blobSize, + readOffset: 0, + } +} + +var _ BlobReader = &CompressedBlobReader{} diff --git a/pkg/bundle/zim/content_entry.go b/pkg/bundle/zim/content_entry.go new file mode 100644 index 0000000..3b48aba --- /dev/null +++ b/pkg/bundle/zim/content_entry.go @@ -0,0 +1,193 @@ +package zim + +import ( + "encoding/binary" + + "github.com/pkg/errors" +) + +type zimCompression int + +const ( + zimCompressionNoneZeno zimCompression = 0 + zimCompressionNone zimCompression = 1 + zimCompressionNoneZLib zimCompression = 2 + zimCompressionNoneBZip2 zimCompression = 3 + zimCompressionNoneXZ zimCompression = 4 + zimCompressionNoneZStandard zimCompression = 5 +) + +type ContentEntry struct { + *BaseEntry + mimeType string + clusterIndex uint32 + blobIndex uint32 +} + +func (e *ContentEntry) Compression() (int, 
error) { + clusterHeader, _, _, err := e.readClusterInfo() + if err != nil { + return 0, errors.WithStack(err) + } + + return int((clusterHeader << 4) >> 4), nil +} + +func (e *ContentEntry) MimeType() string { + return e.mimeType +} + +func (e *ContentEntry) Reader() (BlobReader, error) { + clusterHeader, clusterStartOffset, clusterEndOffset, err := e.readClusterInfo() + if err != nil { + return nil, errors.WithStack(err) + } + + compression := (clusterHeader << 4) >> 4 + extended := (clusterHeader<<3)>>7 == 1 + + blobSize := 4 + if extended { + blobSize = 8 + } + + switch compression { + + // Uncompressed blobs + case uint8(zimCompressionNoneZeno): + fallthrough + case uint8(zimCompressionNone): + startPos := clusterStartOffset + 1 + blobOffset := uint64(e.blobIndex * uint32(blobSize)) + + data := make([]byte, 2*blobSize) + if err := e.reader.readRange(int64(startPos+blobOffset), data); err != nil { + return nil, errors.WithStack(err) + } + + var ( + blobStart uint64 + blobEnd uint64 + ) + + if extended { + blobStart64, err := readUint64(data[0:blobSize], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + blobStart = blobStart64 + + blobEnd64, err := readUint64(data[blobSize:blobSize*2], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + blobEnd = uint64(blobEnd64) + } else { + blobStart32, err := readUint32(data[0:blobSize], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + blobStart = uint64(blobStart32) + + blobEnd32, err := readUint32(data[blobSize:blobSize*2], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + blobEnd = uint64(blobEnd32) + } + + return NewUncompressedBlobReader(e.reader, startPos+blobStart, startPos+blobEnd, blobSize), nil + + // Supported compression algorithms + case uint8(zimCompressionNoneXZ): + return NewXZBlobReader(e.reader, clusterStartOffset, clusterEndOffset, e.blobIndex, blobSize), nil + + case 
uint8(zimCompressionNoneZStandard): + return NewZStdBlobReader(e.reader, clusterStartOffset, clusterEndOffset, e.blobIndex, blobSize), nil + + // Unsupported compression algorithms + case uint8(zimCompressionNoneZLib): + fallthrough + case uint8(zimCompressionNoneBZip2): + fallthrough + default: + return nil, errors.Wrapf(ErrCompressionAlgorithmNotSupported, "unexpected compression algorithm '%d'", compression) + } +} + +func (e *ContentEntry) Redirect() (*ContentEntry, error) { + return e, nil +} + +func (e *ContentEntry) readClusterInfo() (uint8, uint64, uint64, error) { + startClusterOffset, clusterEndOffset, err := e.reader.getClusterOffsets(int(e.clusterIndex)) + if err != nil { + return 0, 0, 0, errors.WithStack(err) + } + + data := make([]byte, 1) + if err := e.reader.readRange(int64(startClusterOffset), data); err != nil { + return 0, 0, 0, errors.WithStack(err) + } + + clusterHeader := uint8(data[0]) + + return clusterHeader, startClusterOffset, clusterEndOffset, nil +} + +func (r *Reader) parseContentEntry(offset int64, base *BaseEntry) (*ContentEntry, error) { + entry := &ContentEntry{ + BaseEntry: base, + } + + data := make([]byte, 16) + if err := r.readRange(offset, data); err != nil { + return nil, errors.WithStack(err) + } + + mimeTypeIndex, err := readUint16(data[0:2], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + if mimeTypeIndex >= uint16(len(r.mimeTypes)) { + return nil, errors.Errorf("mime type index '%d' greater than mime types length '%d'", mimeTypeIndex, len(r.mimeTypes)) + } + + entry.mimeType = r.mimeTypes[mimeTypeIndex] + + entry.namespace = Namespace(data[3:4]) + + clusterIndex, err := readUint32(data[8:12], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.clusterIndex = clusterIndex + + blobIndex, err := readUint32(data[12:16], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.blobIndex = blobIndex + + strs, _, err := 
r.readStringsAt(offset+16, 2, 1024) + if err != nil { + return nil, errors.WithStack(err) + } + + if len(strs) > 0 { + entry.url = strs[0] + } + + if len(strs) > 1 { + entry.title = strs[1] + } + + return entry, nil +} diff --git a/pkg/bundle/zim/entry.go b/pkg/bundle/zim/entry.go new file mode 100644 index 0000000..bd58d16 --- /dev/null +++ b/pkg/bundle/zim/entry.go @@ -0,0 +1,135 @@ +package zim + +import ( + "encoding/binary" + "fmt" + + "github.com/pkg/errors" +) + +type Entry interface { + Redirect() (*ContentEntry, error) + Namespace() Namespace + URL() string + FullURL() string + Title() string +} + +type BaseEntry struct { + mimeTypeIndex uint16 + namespace Namespace + url string + title string + reader *Reader +} + +func (e *BaseEntry) Namespace() Namespace { + return e.namespace +} + +func (e *BaseEntry) Title() string { + if e.title == "" { + return e.url + } + + return e.title +} + +func (e *BaseEntry) URL() string { + return e.url +} + +func (e *BaseEntry) FullURL() string { + return toFullURL(e.Namespace(), e.URL()) +} + +func (r *Reader) parseBaseEntry(offset int64) (*BaseEntry, error) { + entry := &BaseEntry{ + reader: r, + } + + data := make([]byte, 3) + if err := r.readRange(offset, data); err != nil { + return nil, errors.WithStack(err) + } + + mimeTypeIndex, err := readUint16(data[0:2], binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.mimeTypeIndex = mimeTypeIndex + entry.namespace = Namespace(data[2]) + + return entry, nil +} + +type RedirectEntry struct { + *BaseEntry + redirectIndex uint32 +} + +func (e *RedirectEntry) Redirect() (*ContentEntry, error) { + if e.redirectIndex >= uint32(len(e.reader.urlIndex)) { + return nil, errors.Wrapf(ErrInvalidIndex, "entry index '%d' out of bounds", e.redirectIndex) + } + + entryPtr := e.reader.urlIndex[e.redirectIndex] + entry, err := e.reader.parseEntryAt(int64(entryPtr)) + if err != nil { + return nil, errors.WithStack(err) + } + + entry, err = entry.Redirect() + 
if err != nil { + return nil, errors.WithStack(err) + } + + contentEntry, ok := entry.(*ContentEntry) + if !ok { + return nil, errors.WithStack(ErrInvalidRedirect) + } + + return contentEntry, nil +} + +func (r *Reader) parseRedirectEntry(offset int64, base *BaseEntry) (*RedirectEntry, error) { + entry := &RedirectEntry{ + BaseEntry: base, + } + + data := make([]byte, 4) + if err := r.readRange(offset+8, data); err != nil { + return nil, errors.WithStack(err) + } + + redirectIndex, err := readUint32(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.redirectIndex = redirectIndex + + strs, _, err := r.readStringsAt(offset+12, 2, 1024) + if err != nil { + return nil, errors.WithStack(err) + } + + if len(strs) > 0 { + entry.url = strs[0] + } + + if len(strs) > 1 { + entry.title = strs[1] + } + + return entry, nil +} + +func toFullURL(ns Namespace, url string) string { + if ns == "\x00" { + return url + } + + return fmt.Sprintf("%s/%s", ns, url) +} diff --git a/pkg/bundle/zim/entry_iterator.go b/pkg/bundle/zim/entry_iterator.go new file mode 100644 index 0000000..13622fc --- /dev/null +++ b/pkg/bundle/zim/entry_iterator.go @@ -0,0 +1,46 @@ +package zim + +import "github.com/pkg/errors" + +type EntryIterator struct { + index int + entry Entry + err error + reader *Reader +} + +func (it *EntryIterator) Next() bool { + if it.err != nil { + return false + } + + entryCount := it.reader.EntryCount() + + if it.index >= int(entryCount) { + return false + } + + entry, err := it.reader.EntryAt(it.index) + if err != nil { + it.err = errors.WithStack(err) + + return false + } + + it.entry = entry + it.index++ + + return true +} + +func (it *EntryIterator) Err() error { + return it.err +} + +func (it *EntryIterator) Index() int { + return it.index - 1 +} + +func (it *EntryIterator) Entry() Entry { + return it.entry +} diff --git a/pkg/bundle/zim/error.go b/pkg/bundle/zim/error.go index 236a681..35f5bf8 100644 --- 
a/pkg/bundle/zim/error.go +++ b/pkg/bundle/zim/error.go @@ -2,4 +2,9 @@ package zim import "errors" -var ErrNotFound = errors.New("not found") +var ( + ErrInvalidIndex = errors.New("invalid index") + ErrNotFound = errors.New("not found") + ErrInvalidRedirect = errors.New("invalid redirect") + ErrCompressionAlgorithmNotSupported = errors.New("compression algorithm not supported") +) diff --git a/pkg/bundle/zim/favicon.go b/pkg/bundle/zim/favicon.go index 10ed048..d4e8776 100644 --- a/pkg/bundle/zim/favicon.go +++ b/pkg/bundle/zim/favicon.go @@ -2,8 +2,8 @@ package zim import "github.com/pkg/errors" -func (z *ZimReader) Favicon() (*Article, error) { - illustration, err := z.getMetadataIllustration() +func (r *Reader) Favicon() (*ContentEntry, error) { + illustration, err := r.getMetadataIllustration() if err != nil && !errors.Is(err, ErrNotFound) { return nil, errors.WithStack(err) } @@ -12,37 +12,54 @@ func (z *ZimReader) Favicon() (*Article, error) { return illustration, nil } - namespaces := []string{"-", "I"} - entryNames := []string{"favicon", "favicon.png"} + namespaces := []Namespace{V5NamespaceLayout, V5NamespaceImageFile} + urls := []string{"favicon", "favicon.png"} for _, ns := range namespaces { - for _, en := range entryNames { - article, err := z.GetPageNoIndex(ns + "/" + en) + for _, url := range urls { + entry, err := r.EntryWithURL(ns, url) if err != nil && !errors.Is(err, ErrNotFound) { return nil, errors.WithStack(err) } - if article != nil { - return article, nil + if errors.Is(err, ErrNotFound) { + continue } + + content, err := entry.Redirect() + if err != nil { + return nil, errors.WithStack(err) + } + + return content, nil } } return nil, errors.WithStack(ErrNotFound) } -func (z *ZimReader) getMetadataIllustration() (*Article, error) { - metadata, err := z.Metadata(MetadataIllustration96x96at2, MetadataIllustration48x48at1) +func (r *Reader) getMetadataIllustration() (*ContentEntry, error) { + keys := []MetadataKey{MetadataIllustration96x96at2, 
MetadataIllustration48x48at1} + + metadata, err := r.Metadata(keys...) if err != nil { return nil, errors.WithStack(err) } - if _, exists := metadata[MetadataIllustration96x96at2]; exists { - return z.GetPageNoIndex("M/" + string(MetadataIllustration96x96at2)) - } + for _, k := range keys { + if _, exists := metadata[k]; exists { + entry, err := r.EntryWithURL(V5NamespaceMetadata, string(k)) + if err != nil { + return nil, errors.WithStack(err) + } - if _, exists := metadata[MetadataIllustration48x48at1]; exists { - return z.GetPageNoIndex("M/" + string(MetadataIllustration48x48at1)) + content, err := entry.Redirect() + if err != nil { + return nil, errors.WithStack(err) + } + + return content, nil + } } return nil, errors.WithStack(ErrNotFound) diff --git a/pkg/bundle/zim/metadata.go b/pkg/bundle/zim/metadata.go index 469e89e..8b27e50 100644 --- a/pkg/bundle/zim/metadata.go +++ b/pkg/bundle/zim/metadata.go @@ -1,6 +1,8 @@ package zim import ( + "io" + "github.com/pkg/errors" ) @@ -40,7 +42,7 @@ var knownKeys = []MetadataKey{ } // Metadata returns a copy of the internal metadata map of the ZIM file. 
-func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error) { +func (r *Reader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error) { if len(keys) == 0 { keys = knownKeys } @@ -48,7 +50,7 @@ func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error metadata := make(map[MetadataKey]string) for _, key := range keys { - article, err := z.GetPageNoIndex("M/" + string(key)) + entry, err := r.EntryWithURL(V5NamespaceMetadata, string(key)) if err != nil { if errors.Is(err, ErrNotFound) { continue @@ -57,9 +59,19 @@ func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error return nil, errors.WithStack(err) } - data, err := article.Data() - if errors.Is(err, ErrNotFound) { - continue + content, err := entry.Redirect() + if err != nil { + return nil, errors.WithStack(err) + } + + reader, err := content.Reader() + if err != nil { + return nil, errors.WithStack(err) + } + + data, err := io.ReadAll(reader) + if err != nil { + return nil, errors.WithStack(err) } metadata[key] = string(data) diff --git a/pkg/bundle/zim/namespace.go b/pkg/bundle/zim/namespace.go new file mode 100644 index 0000000..db869db --- /dev/null +++ b/pkg/bundle/zim/namespace.go @@ -0,0 +1,23 @@ +package zim + +type Namespace string + +const ( + V6NamespaceContent Namespace = "C" + V6NamespaceMetadata Namespace = "M" + V6NamespaceWellKnown Namespace = "W" + V6NamespaceSearch Namespace = "X" +) + +const ( + V5NamespaceLayout Namespace = "-" + V5NamespaceArticle Namespace = "A" + V5NamespaceArticleMetadata Namespace = "B" + V5NamespaceImageFile Namespace = "I" + V5NamespaceImageText Namespace = "J" + V5NamespaceMetadata Namespace = "M" + V5NamespaceCategoryText Namespace = "U" + V5NamespaceCategoryArticleList Namespace = "V" + V5NamespaceCategoryPerArticle Namespace = "W" + V5NamespaceSearch Namespace = "X" +) diff --git a/pkg/bundle/zim/option.go b/pkg/bundle/zim/option.go new file mode 100644 index 0000000..ee0677f --- /dev/null 
+++ b/pkg/bundle/zim/option.go @@ -0,0 +1,30 @@ +package zim + +import "time" + +type Options struct { + URLCacheSize int + URLCacheTTL time.Duration + CacheSize int +} + +type OptionFunc func(opts *Options) + +func NewOptions(funcs ...OptionFunc) *Options { + funcs = append([]OptionFunc{ + WithCacheSize(2048), + }, funcs...) + + opts := &Options{} + for _, fn := range funcs { + fn(opts) + } + + return opts +} + +func WithCacheSize(size int) OptionFunc { + return func(opts *Options) { + opts.CacheSize = size + } +} diff --git a/pkg/bundle/zim/reader.go b/pkg/bundle/zim/reader.go new file mode 100644 index 0000000..560121b --- /dev/null +++ b/pkg/bundle/zim/reader.go @@ -0,0 +1,558 @@ +package zim + +import ( + "context" + "encoding/binary" + "fmt" + "io" + "os" + "strings" + + lru "github.com/hashicorp/golang-lru/v2" + "github.com/pkg/errors" + "gitlab.com/wpetit/goweb/logger" +) + +const zimFormatMagicNumber uint32 = 0x44D495A +const nullByte = '\x00' +const zimRedirect = 0xffff + +type Reader struct { + majorVersion uint16 + minorVersion uint16 + uuid string + entryCount uint32 + clusterCount uint32 + urlPtrPos uint64 + titlePtrPos uint64 + clusterPtrPos uint64 + mimeListPos uint64 + mainPage uint32 + layoutPage uint32 + checksumPos uint64 + + mimeTypes []string + urlIndex []uint64 + clusterIndex []uint64 + + cache *lru.Cache[string, Entry] + urls map[string]int + + rangeReader RangeReadCloser +} + +func (r *Reader) Version() (majorVersion, minorVersion uint16) { + return r.majorVersion, r.minorVersion +} + +func (r *Reader) EntryCount() uint32 { + return r.entryCount +} + +func (r *Reader) ClusterCount() uint32 { + return r.clusterCount +} + +func (r *Reader) UUID() string { + return r.uuid +} + +func (r *Reader) Close() error { + if err := r.rangeReader.Close(); err != nil { + return errors.WithStack(err) + } + + return nil +} + +func (r *Reader) MainPage() (Entry, error) { + if r.mainPage == 0xffffffff { + return nil, errors.WithStack(ErrNotFound) + } + + 
entry, err := r.EntryAt(int(r.mainPage)) + if err != nil { + return nil, errors.WithStack(ErrNotFound) + } + + return entry, nil +} + +func (r *Reader) Entries() *EntryIterator { + return &EntryIterator{ + reader: r, + } +} + +func (r *Reader) EntryAt(idx int) (Entry, error) { + if idx >= len(r.urlIndex) || idx < 0 { + return nil, errors.Wrapf(ErrInvalidIndex, "index '%d' out of bounds", idx) + } + + entryPtr := r.urlIndex[idx] + + entry, err := r.parseEntryAt(int64(entryPtr)) + if err != nil { + return nil, errors.WithStack(err) + } + + r.cacheEntry(entryPtr, entry) + + return entry, nil +} + +func (r *Reader) EntryWithFullURL(url string) (Entry, error) { + urlNum, exists := r.urls[url] + if !exists { + return nil, errors.WithStack(ErrNotFound) + } + + entry, err := r.EntryAt(urlNum) + if err != nil { + return nil, errors.WithStack(err) + } + + return entry, nil +} + +func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) { + fullURL := toFullURL(ns, url) + + entry, err := r.EntryWithFullURL(fullURL) + if err != nil { + return nil, errors.WithStack(err) + } + + return entry, nil +} + +func (r *Reader) EntryWithTitle(ns Namespace, title string) (Entry, error) { + entry, found := r.getEntryByTitleFromCache(ns, title) + if found { + logger.Debug(context.Background(), "found entry with title from cache", logger.F("entry", entry.FullURL())) + return entry, nil + } + + iterator := r.Entries() + + for iterator.Next() { + entry := iterator.Entry() + + if entry.Title() == title && entry.Namespace() == ns { + return entry, nil + } + } + if err := iterator.Err(); err != nil { + return nil, errors.WithStack(err) + } + + return nil, errors.WithStack(ErrNotFound) +} + +func (r *Reader) getURLCacheKey(fullURL string) string { + return "url:" + fullURL +} + +func (r *Reader) getTitleCacheKey(ns Namespace, title string) string { + return fmt.Sprintf("title:%s/%s", ns, title) +} + +func (r *Reader) cacheEntry(offset uint64, entry Entry) { + urlKey := 
r.getURLCacheKey(entry.FullURL()) + titleKey := r.getTitleCacheKey(entry.Namespace(), entry.Title()) + + _, urlFound := r.cache.Peek(urlKey) + _, titleFound := r.cache.Peek(titleKey) + + if urlFound && titleFound { + return + } + + r.cache.Add(urlKey, entry) + r.cache.Add(titleKey, entry) +} + +func (r *Reader) getEntryByTitleFromCache(namespace Namespace, title string) (Entry, bool) { + key := r.getTitleCacheKey(namespace, title) + return r.cache.Get(key) +} + +func (r *Reader) parse() error { + if err := r.parseHeader(); err != nil { + return errors.WithStack(err) + } + + if err := r.parseMimeTypes(); err != nil { + return errors.WithStack(err) + } + + if err := r.parseURLIndex(); err != nil { + return errors.WithStack(err) + } + + if err := r.parseClusterIndex(); err != nil { + return errors.WithStack(err) + } + + return nil +} + +func (r *Reader) parseHeader() error { + header := make([]byte, 80) + if err := r.readRange(0, header); err != nil { + return errors.WithStack(err) + } + + magicNumber, err := readUint32(header[0:4], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + if magicNumber != zimFormatMagicNumber { + return errors.Errorf("invalid zim magic number '%d'", magicNumber) + } + + majorVersion, err := readUint16(header[4:6], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.majorVersion = majorVersion + + minorVersion, err := readUint16(header[6:8], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.minorVersion = minorVersion + + if err := r.parseUUID(header[8:16]); err != nil { + return errors.WithStack(err) + } + + entryCount, err := readUint32(header[24:28], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.entryCount = entryCount + + clusterCount, err := readUint32(header[28:32], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.clusterCount = clusterCount + + urlPtrPos, err := 
readUint64(header[32:40], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.urlPtrPos = urlPtrPos + + titlePtrPos, err := readUint64(header[40:48], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.titlePtrPos = titlePtrPos + + clusterPtrPos, err := readUint64(header[48:56], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.clusterPtrPos = clusterPtrPos + + mimeListPos, err := readUint64(header[56:64], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.mimeListPos = mimeListPos + + mainPage, err := readUint32(header[64:68], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.mainPage = mainPage + + layoutPage, err := readUint32(header[68:72], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.layoutPage = layoutPage + + checksumPos, err := readUint64(header[72:80], binary.LittleEndian) + if err != nil { + return errors.WithStack(err) + } + + r.checksumPos = checksumPos + + return nil +} + +func (r *Reader) parseUUID(data []byte) error { + parts := make([]string, 0, 5) + + val32, err := readUint32(data[0:4], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%08x", val32)) + + val16, err := readUint16(data[4:6], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%04x", val16)) + + val16, err = readUint16(data[6:8], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%04x", val16)) + + val16, err = readUint16(data[8:10], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%04x", val16)) + + val32, err = readUint32(data[10:14], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + val16, err = readUint16(data[14:16], binary.BigEndian) + if err 
!= nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%x%x", val32, val16)) + + r.uuid = strings.Join(parts, "-") + + return nil +} + +func (r *Reader) parseMimeTypes() error { + mimeTypes := make([]string, 0) + offset := int64(r.mimeListPos) + read := int64(0) + var err error + var found []string + for { + found, read, err = r.readStringsAt(offset+read, 64, 1024) + if err != nil && !errors.Is(err, io.EOF) { + return errors.WithStack(err) + } + + if len(found) == 0 || found[0] == "" { + break + } + + mimeTypes = append(mimeTypes, found...) + } + + r.mimeTypes = mimeTypes + + return nil +} + +func (r *Reader) parseURLIndex() error { + urlIndex, err := r.parsePointerIndex(int64(r.urlPtrPos), int64(r.entryCount)) + if err != nil { + return errors.WithStack(err) + } + + r.urlIndex = urlIndex + + return nil +} + +func (r *Reader) parseClusterIndex() error { + clusterIndex, err := r.parsePointerIndex(int64(r.clusterPtrPos), int64(r.clusterCount+1)) + if err != nil { + return errors.WithStack(err) + } + + r.clusterIndex = clusterIndex + + return nil +} + +func (r *Reader) parseEntryAt(offset int64) (Entry, error) { + base, err := r.parseBaseEntry(offset) + if err != nil { + return nil, errors.WithStack(err) + } + + var entry Entry + + if base.mimeTypeIndex == zimRedirect { + entry, err = r.parseRedirectEntry(offset, base) + if err != nil { + return nil, errors.WithStack(err) + } + } else { + entry, err = r.parseContentEntry(offset, base) + if err != nil { + return nil, errors.WithStack(err) + } + } + + return entry, nil +} + +func (r *Reader) parsePointerIndex(startAddr int64, count int64) ([]uint64, error) { + index := make([]uint64, count) + + data := make([]byte, count*8) + if err := r.readRange(startAddr, data); err != nil { + return nil, errors.WithStack(err) + } + + for i := int64(0); i < count; i++ { + offset := i * 8 + ptr, err := readUint64(data[offset:offset+8], binary.LittleEndian) + if err != nil { + return nil, 
errors.WithStack(err) + } + + index[i] = ptr + } + + return index, nil +} + +func (r *Reader) getClusterOffsets(clusterNum int) (uint64, uint64, error) { + if clusterNum > len(r.clusterIndex)-1 || clusterNum < 0 { + return 0, 0, errors.Wrapf(ErrInvalidIndex, "index '%d' out of bounds", clusterNum) + } + + return r.clusterIndex[clusterNum], r.clusterIndex[clusterNum+1] - 1, nil +} + +func (r *Reader) preload() error { + r.urls = make(map[string]int, r.entryCount) + + iterator := r.Entries() + for iterator.Next() { + entry := iterator.Entry() + r.urls[entry.FullURL()] = iterator.Index() + } + if err := iterator.Err(); err != nil { + return errors.WithStack(err) + } + + return nil +} + +func (r *Reader) readRange(offset int64, v []byte) error { + read, err := r.rangeReader.ReadAt(v, offset) + if err != nil { + return errors.WithStack(err) + } + + if read != len(v) { + return io.EOF + } + + return nil +} + +func (r *Reader) readStringsAt(offset int64, count int, bufferSize int) ([]string, int64, error) { + var sb strings.Builder + read := int64(0) + + values := make([]string, 0, count) + wasNullByte := false + + for { + data := make([]byte, bufferSize) + err := r.readRange(offset+read, data) + if err != nil && !errors.Is(err, io.EOF) { + return nil, read, errors.WithStack(err) + } + + for idx := 0; idx < len(data); idx++ { + d := data[idx] + if err := sb.WriteByte(d); err != nil { + return nil, read, errors.WithStack(err) + } + + read++ + + if d == nullByte { + if wasNullByte { + return values, read, nil + } + + wasNullByte = true + + str := strings.TrimRight(sb.String(), "\x00") + values = append(values, str) + + if len(values) == count || errors.Is(err, io.EOF) { + return values, read, nil + } + + sb.Reset() + } else { + wasNullByte = false + } + } + } +} + +type RangeReadCloser interface { + io.Closer + ReadAt(data []byte, offset int64) (n int, err error) +} + +func NewReader(rangeReader RangeReadCloser, funcs ...OptionFunc) (*Reader, error) { + opts := 
NewOptions(funcs...) + + cache, err := lru.New[string, Entry](opts.CacheSize) + if err != nil { + return nil, errors.WithStack(err) + } + + reader := &Reader{ + rangeReader: rangeReader, + cache: cache, + } + + if err := reader.parse(); err != nil { + return nil, errors.WithStack(err) + } + + if err := reader.preload(); err != nil { + return nil, errors.WithStack(err) + } + + return reader, nil +} + +func Open(path string, funcs ...OptionFunc) (*Reader, error) { + file, err := os.OpenFile(path, os.O_RDONLY, os.ModePerm) + if err != nil { + return nil, errors.WithStack(err) + } + + reader, err := NewReader(file, funcs...) + if err != nil { + return nil, errors.WithStack(err) + } + + return reader, nil +} diff --git a/pkg/bundle/zim/reader_test.go b/pkg/bundle/zim/reader_test.go new file mode 100644 index 0000000..b55321b --- /dev/null +++ b/pkg/bundle/zim/reader_test.go @@ -0,0 +1,133 @@ +package zim + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/pkg/errors" + "gitlab.com/wpetit/goweb/logger" +) + +type readerTestCase struct { + UUID string `json:"uuid"` + EntryCount uint32 `json:"entryCount"` + Entries []struct { + Namespace Namespace `json:"namespace"` + URL string `json:"url"` + Size int64 `json:"size"` + Compression int `json:"compression"` + MimeType string `json:"mimeType"` + Title string `json:"title"` + } `json:"entries"` +} + +func TestReader(t *testing.T) { + if testing.Verbose() { + logger.SetLevel(logger.LevelDebug) + logger.SetFormat(logger.FormatHuman) + } + + files, err := filepath.Glob("testdata/*.zim") + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + for _, zf := range files { + testName := filepath.Base(zf) + testCase, err := loadZimFileTestCase(zf) + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + t.Run(testName, func(t *testing.T) { + reader, err := Open(zf) + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + defer func() { + if err := 
reader.Close(); err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + }() + + if e, g := testCase.UUID, reader.UUID(); e != g { + t.Errorf("reader.UUID(): expected '%s', got '%s'", e, g) + } + + if e, g := testCase.EntryCount, reader.EntryCount(); e != g { + t.Errorf("reader.EntryCount(): expected '%v', got '%v'", e, g) + } + + if testCase.Entries == nil { + return + } + + for _, entryTestCase := range testCase.Entries { + testName := fmt.Sprintf("Entry/%s/%s", entryTestCase.Namespace, entryTestCase.URL) + t.Run(testName, func(t *testing.T) { + entry, err := reader.EntryWithURL(entryTestCase.Namespace, entryTestCase.URL) + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + content, err := entry.Redirect() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + + if e, g := entryTestCase.MimeType, content.MimeType(); e != g { + t.Errorf("content.MimeType(): expected '%v', got '%v'", e, g) + } + + if e, g := entryTestCase.Title, content.Title(); e != g { + t.Errorf("content.Title(): expected '%v', got '%v'", e, g) + } + + compression, err := content.Compression() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + + if e, g := entryTestCase.Compression, compression; e != g { + t.Errorf("content.Compression(): expected '%v', got '%v'", e, g) + } + + contentReader, err := content.Reader() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + + size, err := contentReader.Size() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + + if e, g := entryTestCase.Size, size; e != g { + t.Errorf("content.Size(): expected '%v', got '%v'", e, g) + } + }) + } + }) + } +} + +func loadZimFileTestCase(zimFile string) (*readerTestCase, error) { + testCaseFile, _ := strings.CutSuffix(zimFile, ".zim") + + data, err := os.ReadFile(testCaseFile + ".json") + if err != nil { + return nil, errors.WithStack(err) + } + + testCase := &readerTestCase{} + if err := json.Unmarshal(data, testCase); err != nil { + return nil, 
errors.WithStack(err) + } + + return testCase, nil +} diff --git a/pkg/bundle/zim/testdata/beer.stackexchange.com_en_all_2023-05.json b/pkg/bundle/zim/testdata/beer.stackexchange.com_en_all_2023-05.json new file mode 100644 index 0000000..f83cb59 --- /dev/null +++ b/pkg/bundle/zim/testdata/beer.stackexchange.com_en_all_2023-05.json @@ -0,0 +1,14 @@ +{ + "uuid": "8d141c3b-115d-bf73-294a-ee3c2e6b97b0", + "entryCount": 6223, + "entries": [ + { + "namespace": "C", + "url": "users_page=9", + "compression": 5, + "size": 58646, + "mimeType": "text/html", + "title": "users_page=9" + } + ] +} \ No newline at end of file diff --git a/pkg/bundle/zim/testdata/beer.stackexchange.com_en_all_2023-05.zim b/pkg/bundle/zim/testdata/beer.stackexchange.com_en_all_2023-05.zim new file mode 100644 index 0000000..0fd2e28 Binary files /dev/null and b/pkg/bundle/zim/testdata/beer.stackexchange.com_en_all_2023-05.zim differ diff --git a/pkg/bundle/zim/testdata/cadoles.json b/pkg/bundle/zim/testdata/cadoles.json new file mode 100644 index 0000000..c9ea8a1 --- /dev/null +++ b/pkg/bundle/zim/testdata/cadoles.json @@ -0,0 +1,22 @@ +{ + "uuid": "cf81f094-d802-c790-b854-c74ad9701ddb", + "entryCount": 271, + "entries": [ + { + "namespace": "C", + "url": "blog/202206-ShowroomInnovation.jpg", + "compression": 1, + "size": 260260, + "mimeType": "image/jpeg", + "title": "blog/202206-ShowroomInnovation.jpg" + }, + { + "namespace": "C", + "url": "team/index.html", + "compression": 5, + "size": 93185, + "mimeType": "text/html", + "title": "Cadoles - Notre équipe" + } + ] +} \ No newline at end of file diff --git a/pkg/bundle/zim/testdata/cadoles.zim b/pkg/bundle/zim/testdata/cadoles.zim new file mode 100644 index 0000000..fac4912 Binary files /dev/null and b/pkg/bundle/zim/testdata/cadoles.zim differ diff --git a/pkg/bundle/zim/testdata/wikibooks_af_all_maxi_2023-06.json b/pkg/bundle/zim/testdata/wikibooks_af_all_maxi_2023-06.json new file mode 100644 index 0000000..1d9abf5 --- /dev/null +++ 
b/pkg/bundle/zim/testdata/wikibooks_af_all_maxi_2023-06.json @@ -0,0 +1,14 @@ +{ + "uuid": "ad4f406c-2021-2db8-c729-297568bbe376", + "entryCount": 330, + "entries": [ + { + "namespace": "M", + "url": "Illustration_48x48@1", + "compression": 5, + "size": 5365, + "mimeType": "text/plain", + "title": "Illustration_48x48@1" + } + ] +} \ No newline at end of file diff --git a/pkg/bundle/zim/tools.go b/pkg/bundle/zim/tools.go deleted file mode 100644 index 3469d2a..0000000 --- a/pkg/bundle/zim/tools.go +++ /dev/null @@ -1,43 +0,0 @@ -package zim - -import ( - "bytes" - "encoding/binary" -) - -// read a little endian uint64 -func readInt64(b []byte, err error) (v uint64, aerr error) { - if err != nil { - aerr = err - - return - } - buf := bytes.NewBuffer(b) - aerr = binary.Read(buf, binary.LittleEndian, &v) - return -} - -// read a little endian uint32 -func readInt32(b []byte, err error) (v uint32, aerr error) { - if err != nil { - aerr = err - return - } - buf := bytes.NewBuffer(b) - aerr = binary.Read(buf, binary.LittleEndian, &v) - - return -} - -// read a little endian uint32 -func readInt16(b []byte, err error) (v uint16, aerr error) { - if err != nil { - aerr = err - - return - } - buf := bytes.NewBuffer(b) - aerr = binary.Read(buf, binary.LittleEndian, &v) - - return -} diff --git a/pkg/bundle/zim/uncompressed_blob_reader.go b/pkg/bundle/zim/uncompressed_blob_reader.go new file mode 100644 index 0000000..5da60b7 --- /dev/null +++ b/pkg/bundle/zim/uncompressed_blob_reader.go @@ -0,0 +1,86 @@ +package zim + +import ( + "io" + "sync" + + "github.com/pkg/errors" +) + +type UncompressedBlobReader struct { + reader *Reader + blobStartOffset uint64 + blobEndOffset uint64 + blobSize int + readOffset int + + blobData []byte + loadBlobOnce sync.Once + loadBlobErr error +} + +// Size implements BlobReader. +func (r *UncompressedBlobReader) Size() (int64, error) { + return int64(r.blobEndOffset - r.blobStartOffset), nil +} + +// Close implements io.ReadCloser. 
+func (r *UncompressedBlobReader) Close() error { + clear(r.blobData) + return nil +} + +// Read implements io.ReadCloser. +func (r *UncompressedBlobReader) Read(p []byte) (n int, err error) { + blobData, err := r.loadBlob() + if err != nil { + return 0, errors.WithStack(err) + } + + chunkLength := len(p) + remaining := int(len(blobData) - r.readOffset) + if chunkLength > remaining { + chunkLength = remaining + } + + chunk := blobData[r.readOffset : r.readOffset+chunkLength] + r.readOffset += chunkLength + + copy(p, chunk) + + if chunkLength == remaining { + return chunkLength, io.EOF + } + + return chunkLength, nil +} + +func (r *UncompressedBlobReader) loadBlob() ([]byte, error) { + r.loadBlobOnce.Do(func() { + data := make([]byte, r.blobEndOffset-r.blobStartOffset) + err := r.reader.readRange(int64(r.blobStartOffset), data) + if err != nil { + r.loadBlobErr = errors.WithStack(err) + return + } + + r.blobData = data + }) + if r.loadBlobErr != nil { + return nil, errors.WithStack(r.loadBlobErr) + } + + return r.blobData, nil +} + +func NewUncompressedBlobReader(reader *Reader, blobStartOffset, blobEndOffset uint64, blobSize int) *UncompressedBlobReader { + return &UncompressedBlobReader{ + reader: reader, + blobStartOffset: blobStartOffset, + blobEndOffset: blobEndOffset, + blobSize: blobSize, + readOffset: 0, + } +} + +var _ BlobReader = &UncompressedBlobReader{} diff --git a/pkg/bundle/zim/util.go b/pkg/bundle/zim/util.go new file mode 100644 index 0000000..fd53347 --- /dev/null +++ b/pkg/bundle/zim/util.go @@ -0,0 +1,52 @@ +package zim + +import ( + "bytes" + "encoding/binary" + + "github.com/pkg/errors" +) + +// read a little endian uint64 +func readUint64(b []byte, order binary.ByteOrder) (uint64, error) { + var v uint64 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} + +// read a little endian uint32 +func readUint32(b []byte, order binary.ByteOrder) (uint32, error) 
{ + var v uint32 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} + +// read a little endian uint16 +func readUint16(b []byte, order binary.ByteOrder) (uint16, error) { + var v uint16 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} + +// read a little endian uint8 +func readUint8(b []byte, order binary.ByteOrder) (uint8, error) { + var v uint8 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} diff --git a/pkg/bundle/zim/xz_blob_reader.go b/pkg/bundle/zim/xz_blob_reader.go new file mode 100644 index 0000000..420ab30 --- /dev/null +++ b/pkg/bundle/zim/xz_blob_reader.go @@ -0,0 +1,42 @@ +package zim + +import ( + "io" + + "github.com/pkg/errors" + "github.com/ulikunitz/xz" +) + +type XZBlobReader struct { + decoder *xz.Reader +} + +// Close implements io.ReadCloser. +func (r *XZBlobReader) Close() error { + return nil +} + +// Read implements io.ReadCloser. 
+func (r *XZBlobReader) Read(p []byte) (n int, err error) { + return r.decoder.Read(p) +} + +var _ io.ReadCloser = &XZBlobReader{} + +func NewXZBlobReader(reader *Reader, clusterStartOffset, clusterEndOffset uint64, blobIndex uint32, blobSize int) *CompressedBlobReader { + return NewCompressedBlobReader( + reader, + func(r io.Reader) (io.ReadCloser, error) { + decoder, err := xz.NewReader(r) + if err != nil { + return nil, errors.WithStack(err) + } + + return &XZBlobReader{decoder}, nil + }, + clusterStartOffset, + clusterEndOffset, + blobIndex, + blobSize, + ) +} diff --git a/pkg/bundle/zim/xz_reader.go b/pkg/bundle/zim/xz_reader.go deleted file mode 100644 index 48cdeab..0000000 --- a/pkg/bundle/zim/xz_reader.go +++ /dev/null @@ -1,23 +0,0 @@ -package zim - -import ( - "io" - - "github.com/ulikunitz/xz" -) - -type XZReader struct { - *xz.Reader -} - -func NewXZReader(r io.Reader) (*XZReader, error) { - dec, err := xz.NewReader(r) - if err != nil { - return nil, err - } - return &XZReader{dec}, nil -} - -func (xr *XZReader) Close() error { - return nil -} diff --git a/pkg/bundle/zim/zim.go b/pkg/bundle/zim/zim.go deleted file mode 100644 index 8d51510..0000000 --- a/pkg/bundle/zim/zim.go +++ /dev/null @@ -1,317 +0,0 @@ -package zim - -import ( - "bytes" - "fmt" - "io" - "os" - "strings" - "sync" - - lru "github.com/hashicorp/golang-lru/v2" - "github.com/pkg/errors" -) - -const ( - zimHeader = 72173914 -) - -// ZimReader keep tracks of everything related to ZIM reading -type ZimReader struct { - f *os.File - UUID uint32 - ArticleCount uint32 - clusterCount uint32 - urlPtrPos uint64 - titlePtrPos uint64 - clusterPtrPos uint64 - mimeListPos uint64 - mainPage uint32 - layoutPage uint32 - mimeTypeList []string -} - -// create a new zim reader -func NewReader(path string) (*ZimReader, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - z := ZimReader{f: f, mainPage: 0xffffffff, layoutPage: 0xffffffff} - - articlePool = sync.Pool{ - New: func() 
interface{} { - return new(Article) - }, - } - // keep 4 latest uncompressed blobs, around 1M per blob - bcache, _ = lru.New[any, any](5) - - err = z.readFileHeaders() - return &z, err -} - -// Return an ordered list of mime types present in the ZIM file -func (z *ZimReader) MimeTypes() []string { - if len(z.mimeTypeList) != 0 { - return z.mimeTypeList - } - - var s []string - // assume mime list fit in 2k - b, err := z.bytesRangeAt(z.mimeListPos, z.mimeListPos+2048) - if err != nil { - return s - } - bbuf := bytes.NewBuffer(b) - - for { - line, err := bbuf.ReadBytes('\x00') - if err != nil && err != io.EOF { - return s - } - // a line of 1 is a line containing only \x00 and it's the marker for the - // end of mime types list - if len(line) == 1 { - break - } - s = append(s, strings.TrimRight(string(line), "\x00")) - } - z.mimeTypeList = s - return s -} - -// list all articles, using url index, contained in a zim file -// note that this is a slow implementation, a real iterator is faster -// you are not suppose to use this method on big zim files, use indexes -func (z *ZimReader) ListArticles() <-chan *Article { - ch := make(chan *Article, 10) - - go func() { - var idx uint32 - // starting at 1 to avoid "con" entry - var start uint32 = 1 - - for idx = start; idx < z.ArticleCount; idx++ { - art, err := z.ArticleAtURLIdx(idx) - if err != nil { - continue - } - - if art == nil { - // TODO: deal with redirect continue - } - ch <- art - } - close(ch) - }() - return ch -} - -// list all title pointer, Titles by position contained in a zim file -// Titles are pointers to URLpos index, useful for indexing cause smaller to store: uint32 -// note that this is a slow implementation, a real iterator is faster -// you are not suppose to use this method on big zim files prefer ListTitlesPtrIterator to build your index -func (z *ZimReader) ListTitlesPtr() <-chan uint32 { - ch := make(chan uint32, 10) - - go func() { - var pos uint64 - var count uint32 - - for pos = z.titlePtrPos; 
count < z.ArticleCount; pos += 4 { - idx, err := readInt32(z.bytesRangeAt(pos, pos+4)) - if err != nil { - continue - } - ch <- idx - count++ - } - close(ch) - }() - return ch -} - -// list all title pointer, Titles by position contained in a zim file -// Titles are pointers to URLpos index, usefull for indexing cause smaller to store: uint32 -func (z *ZimReader) ListTitlesPtrIterator(cb func(uint32)) { - var count uint32 - for pos := z.titlePtrPos; count < z.ArticleCount; pos += 4 { - idx, err := readInt32(z.bytesRangeAt(pos, pos+4)) - if err != nil { - continue - } - cb(idx) - count++ - } -} - -// return the article at the exact url not using any index -func (z *ZimReader) GetPageNoIndex(url string) (*Article, error) { - // starting at 1 to avoid "con" entry - var start uint32 - stop := z.ArticleCount - - a := new(Article) - - for { - pos := (start + stop) / 2 - - offset, err := z.OffsetAtURLIdx(pos) - if err != nil { - return nil, err - } - err = z.FillArticleAt(a, offset) - if err != nil { - return nil, err - } - - if a.FullURL() == url { - return a, nil - } - - if a.FullURL() > url { - stop = pos - } else { - start = pos - } - if stop-start == 1 { - break - } - - } - return nil, errors.WithStack(ErrNotFound) -} - -// get the offset pointing to Article at pos in the URL idx -func (z *ZimReader) OffsetAtURLIdx(idx uint32) (uint64, error) { - offset := z.urlPtrPos + uint64(idx)*8 - return readInt64(z.bytesRangeAt(offset, offset+8)) -} - -// Close & cleanup the zimreader -func (z *ZimReader) Close() error { - return z.f.Close() -} - -func (z *ZimReader) String() string { - fi, err := z.f.Stat() - if err != nil { - return "corrupted zim" - } - return fmt.Sprintf("Size: %d, ArticleCount: %d urlPtrPos: 0x%x titlePtrPos: 0x%x mimeListPos: 0x%x clusterPtrPos: 0x%x\nMimeTypes: %v", - fi.Size(), z.ArticleCount, z.urlPtrPos, z.titlePtrPos, z.mimeListPos, z.clusterPtrPos, z.MimeTypes()) -} - -// getBytesRangeAt returns bytes from start to end -// it's needed to abstract 
mmap usages rather than read directly on the mmap slices -func (z *ZimReader) bytesRangeAt(start, end uint64) ([]byte, error) { - buf := make([]byte, end-start) - n, err := z.f.ReadAt(buf, int64(start)) - if err != nil { - return nil, fmt.Errorf("can't read bytes %w", err) - } - - if n != int(end-start) { - return nil, errors.New("can't read enough bytes") - } - - return buf, nil -} - -// populate the ZimReader structs with headers -func (z *ZimReader) readFileHeaders() error { - // checking for file type - v, err := readInt32(z.bytesRangeAt(0, 0+4)) - if err != nil || v != zimHeader { - return errors.New("not a ZIM file") - } - - // checking for version - v, err = readInt32(z.bytesRangeAt(4, 4+4)) - if err != nil { - return errors.Wrap(err, "could not read file version") - } - - // checking for articles count - v, err = readInt32(z.bytesRangeAt(8, 16)) - if err != nil { - return err - } - z.UUID = v - - // checking for articles count - v, err = readInt32(z.bytesRangeAt(24, 24+4)) - if err != nil { - return err - } - z.ArticleCount = v - - // checking for cluster count - v, err = readInt32(z.bytesRangeAt(28, 28+4)) - if err != nil { - return err - } - z.clusterCount = v - - // checking for urlPtrPos - vb, err := readInt64(z.bytesRangeAt(32, 32+8)) - if err != nil { - return err - } - z.urlPtrPos = vb - - // checking for titlePtrPos - vb, err = readInt64(z.bytesRangeAt(40, 40+8)) - if err != nil { - return err - } - z.titlePtrPos = vb - - // checking for clusterPtrPos - vb, err = readInt64(z.bytesRangeAt(48, 48+8)) - if err != nil { - return err - } - z.clusterPtrPos = vb - - // checking for mimeListPos - vb, err = readInt64(z.bytesRangeAt(56, 56+8)) - if err != nil { - return err - } - z.mimeListPos = vb - - // checking for mainPage - v, err = readInt32(z.bytesRangeAt(64, 64+4)) - if err != nil { - return err - } - z.mainPage = v - - // checking for layoutPage - v, err = readInt32(z.bytesRangeAt(68, 68+4)) - if err != nil { - return err - } - z.layoutPage = v - - 
z.MimeTypes() - return nil -} - -// return start and end offsets for cluster at index idx -func (z *ZimReader) clusterOffsetsAtIdx(idx uint32) (start, end uint64, err error) { - offset := z.clusterPtrPos + (uint64(idx) * 8) - start, err = readInt64(z.bytesRangeAt(offset, offset+8)) - if err != nil { - return - } - offset = z.clusterPtrPos + (uint64(idx+1) * 8) - end, err = readInt64(z.bytesRangeAt(offset, offset+8)) - if err != nil { - return - } - end-- - return -} diff --git a/pkg/bundle/zim/zim_test.go b/pkg/bundle/zim/zim_test.go deleted file mode 100644 index 3c85463..0000000 --- a/pkg/bundle/zim/zim_test.go +++ /dev/null @@ -1,150 +0,0 @@ -package zim - -import ( - "log" - "testing" - - "github.com/pkg/errors" -) - -var Z *ZimReader - -func init() { - var err error - Z, err = NewReader("testdata/wikibooks_af_all_maxi_2023-06.zim") - if err != nil { - log.Panicf("Can't read %v", err) - } -} - -func TestOpen(t *testing.T) { - if Z.ArticleCount == 0 { - t.Errorf("No article found") - } -} - -func TestMime(t *testing.T) { - if len(Z.MimeTypes()) == 0 { - t.Errorf("No mime types found") - } -} - -func TestDisplayInfost(t *testing.T) { - info := Z.String() - if len(info) < 0 { - t.Errorf("Can't read infos") - } - t.Log(info) -} - -func TestURLAtIdx(t *testing.T) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(5) - a, _ := Z.ArticleAt(p) - if a == nil { - t.Errorf("Can't find 1st url") - } -} - -func TestDisplayArticle(t *testing.T) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(5) - a, _ := Z.ArticleAt(p) - if a == nil { - t.Errorf("Can't find 1st url") - } - - t.Log(a) -} - -func TestPageNoIndex(t *testing.T) { - a, _ := Z.GetPageNoIndex("A/Dracula:Capitol_1.html") - if a == nil { - t.Errorf("Can't find existing url") - } -} - -func TestListArticles(t *testing.T) { - if testing.Short() { - t.Skip("skipping test in short mode.") - } - - var i uint32 - - for a := range Z.ListArticles() { - i++ - t.Log(a.String()) - } - - if i == 0 { - t.Errorf("Can't 
find any urls") - } - - if i != Z.ArticleCount-1 { - t.Errorf("Can't find the exact ArticleCount urls %d vs %d", i, Z.ArticleCount) - } -} - -func TestMainPage(t *testing.T) { - a, _ := Z.MainPage() - if a == nil { - t.Errorf("Can't find the mainpage article") - } - - t.Log(a) -} - -func TestFavicon(t *testing.T) { - favicon, err := Z.Favicon() - if err != nil { - t.Errorf("%+v", errors.WithStack(err)) - } - if favicon == nil { - t.Errorf("Can't find the favicon article") - } -} - -func TestMetadata(t *testing.T) { - metadata, err := Z.Metadata() - if err != nil { - t.Errorf("%+v", errors.WithStack(err)) - } - if metadata == nil { - t.Errorf("Can't find the metadata") - } -} - -func TestData(t *testing.T) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(2) - a, _ := Z.ArticleAt(p) - b, _ := a.Data() - data := string(b) - if a.EntryType != RedirectEntry { - if len(data) == 0 { - t.Error("can't read data") - } - } - t.Log(a.String()) - t.Log(data) -} - -func BenchmarkArticleBytes(b *testing.B) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(5) - a, _ := Z.ArticleAt(p) - if a == nil { - b.Errorf("Can't find 1st url") - } - data, err := a.Data() - if err != nil { - b.Error(err) - } - - b.SetBytes(int64(len(data))) - b.ResetTimer() - for i := 0; i < b.N; i++ { - a.Data() - bcache.Purge() // prevent memiozing value - } -} diff --git a/pkg/bundle/zim/zstd_blob_reader.go b/pkg/bundle/zim/zstd_blob_reader.go new file mode 100644 index 0000000..ebc88cf --- /dev/null +++ b/pkg/bundle/zim/zstd_blob_reader.go @@ -0,0 +1,43 @@ +package zim + +import ( + "io" + + "github.com/klauspost/compress/zstd" + "github.com/pkg/errors" +) + +type ZstdBlobReader struct { + decoder *zstd.Decoder +} + +// Close implements io.ReadCloser. +func (r *ZstdBlobReader) Close() error { + r.decoder.Close() + return nil +} + +// Read implements io.ReadCloser. 
+func (r *ZstdBlobReader) Read(p []byte) (n int, err error) { + return r.decoder.Read(p) +} + +var _ io.ReadCloser = &ZstdBlobReader{} + +func NewZStdBlobReader(reader *Reader, clusterStartOffset, clusterEndOffset uint64, blobIndex uint32, blobSize int) *CompressedBlobReader { + return NewCompressedBlobReader( + reader, + func(r io.Reader) (io.ReadCloser, error) { + decoder, err := zstd.NewReader(r) + if err != nil { + return nil, errors.WithStack(err) + } + + return &ZstdBlobReader{decoder}, nil + }, + clusterStartOffset, + clusterEndOffset, + blobIndex, + blobSize, + ) +} diff --git a/pkg/bundle/zim/zstd_reader.go b/pkg/bundle/zim/zstd_reader.go deleted file mode 100644 index 284ac34..0000000 --- a/pkg/bundle/zim/zstd_reader.go +++ /dev/null @@ -1,26 +0,0 @@ -package zim - -import ( - "fmt" - "io" - - "github.com/klauspost/compress/zstd" -) - -type ZstdReader struct { - *zstd.Decoder -} - -func NewZstdReader(r io.Reader) (*ZstdReader, error) { - dec, err := zstd.NewReader(r) - if err != nil { - return nil, fmt.Errorf("can't read from zstd %w", err) - } - return &ZstdReader{dec}, nil -} - -func (zr *ZstdReader) Close() error { - zr.Decoder.Close() - - return nil -} diff --git a/pkg/bundle/zim_bundle.go b/pkg/bundle/zim_bundle.go index f47b884..8e2f90d 100644 --- a/pkg/bundle/zim_bundle.go +++ b/pkg/bundle/zim_bundle.go @@ -3,19 +3,18 @@ package bundle import ( "bytes" "context" - "fmt" "io" "io/fs" - "io/ioutil" "os" "path/filepath" - "strconv" "strings" + "sync" "time" "golang.org/x/net/html" "forge.cadoles.com/arcad/edge/pkg/bundle/zim" + lru "github.com/hashicorp/golang-lru/v2" "github.com/pkg/errors" "gitlab.com/wpetit/goweb/logger" "gopkg.in/yaml.v2" @@ -23,6 +22,12 @@ import ( type ZimBundle struct { archivePath string + + initOnce sync.Once + initErr error + + reader *zim.Reader + urlNamespaceCache *lru.Cache[string, zim.Namespace] } func (b *ZimBundle) File(filename string) (io.ReadCloser, os.FileInfo, error) { @@ -41,7 +46,7 @@ func (b *ZimBundle) 
File(filename string) (io.ReadCloser, os.FileInfo, error) { case "public": return b.renderDirectory(ctx, filename) case "public/index.html": - return b.redirectToMainPage(ctx, filename) + return b.renderMainPage(ctx, filename) default: return b.renderURL(ctx, filename) @@ -49,58 +54,16 @@ func (b *ZimBundle) File(filename string) (io.ReadCloser, os.FileInfo, error) { } func (b *ZimBundle) Dir(dirname string) ([]os.FileInfo, error) { - reader, err := b.openArchive() - if err != nil { - return nil, err - } - - defer func() { - if err := reader.Close(); err != nil { - panic(errors.WithStack(err)) - } - }() - files := make([]os.FileInfo, 0) - // ctx := context.Background() - - // for _, f := range reader.File { - // if !strings.HasPrefix(f.Name, dirname) { - // continue - // } - - // relPath, err := filepath.Rel(dirname, f.Name) - // if err != nil { - // return nil, errors.Wrap(err, "could not get relative path") - // } - - // logger.Debug( - // ctx, "checking file prefix", - // logger.F("dirname", dirname), - // logger.F("filename", f.Name), - // logger.F("relpath", relPath), - // ) - - // if relPath == filepath.Base(f.Name) { - // files = append(files, f.FileInfo()) - // } - // } - return files, nil } func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.FileInfo, error) { - reader, err := b.openArchive() - if err != nil { + if err := b.init(); err != nil { return nil, nil, errors.WithStack(err) } - defer func() { - if err := reader.Close(); err != nil { - panic(errors.WithStack(err)) - } - }() - - metadata, err := reader.Metadata() + metadata, err := b.reader.Metadata() if err != nil { return nil, nil, errors.WithStack(err) } @@ -117,7 +80,7 @@ func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.F manifest["id"] = strings.ToLower(replacer.Replace(name)) + ".zim.edge.app" } else { - manifest["id"] = strconv.FormatUint(uint64(reader.UUID), 10) + ".zim.edge.app" + manifest["id"] = b.reader.UUID() + ".zim.edge.app" } 
if title, exists := metadata[zim.MetadataTitle]; exists { @@ -130,7 +93,7 @@ func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.F manifest["description"] = description } - favicon, err := reader.Favicon() + favicon, err := b.reader.Favicon() if err != nil && !errors.Is(err, zim.ErrNotFound) { return nil, nil, errors.WithStack(err) } @@ -165,7 +128,7 @@ func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.F } buf := bytes.NewBuffer(data) - file := ioutil.NopCloser(buf) + file := io.NopCloser(buf) return file, stat, nil } @@ -180,62 +143,78 @@ func (b *ZimBundle) renderFakeServerMain(ctx context.Context) (io.ReadCloser, os } buf := bytes.NewBuffer(nil) - file := ioutil.NopCloser(buf) + file := io.NopCloser(buf) return file, stat, nil } func (b *ZimBundle) renderURL(ctx context.Context, url string) (io.ReadCloser, os.FileInfo, error) { - zr, err := b.openArchive() - if err != nil { + if err := b.init(); err != nil { return nil, nil, errors.WithStack(err) } - defer func() { - if err := zr.Close(); err != nil { - panic(errors.WithStack(err)) - } - }() - - filename := filepath.Base(url) url = strings.TrimPrefix(url, "public/") - article, err := zr.GetPageNoIndex(url) + entry, err := b.searchEntryFromURL(ctx, url) if err != nil { if errors.Is(err, zim.ErrNotFound) { - return nil, nil, errors.WithStack(fs.ErrNotExist) + return nil, nil, os.ErrNotExist } return nil, nil, errors.WithStack(err) } - if article.EntryType == zim.RedirectEntry { - redirectIndex, err := article.RedirectIndex() - if err != nil { - return nil, nil, errors.WithStack(err) - } + logger.Debug( + ctx, "found zim entry", + logger.F("webURL", url), + logger.F("zimFullURL", entry.FullURL()), + ) - ra, err := zr.ArticleAtURLIdx(redirectIndex) - if err != nil { - return nil, nil, errors.WithStack(err) - } - - return b.renderRedirect(ctx, filename, ra.FullURL()) - } - - data, err := article.Data() + content, err := entry.Redirect() if err != nil { return 
nil, nil, errors.WithStack(err) } - mimeType := article.MimeType() - if mimeType == "text/html" { - injected, err := b.injectEdgeScriptTag(data) - if err != nil { - logger.Error(ctx, "could not inject edge script", logger.E(errors.WithStack(err))) - } else { - data = injected + contentReader, err := content.Reader() + if err != nil { + return nil, nil, errors.WithStack(err) + } + + size, err := contentReader.Size() + if err != nil { + return nil, nil, errors.WithStack(err) + } + + filename := filepath.Base(url) + + mimeType := content.MimeType() + if mimeType != "text/html" { + zimFile := &zimFile{ + fileInfo: &zimFileInfo{ + isDir: false, + modTime: time.Time{}, + mode: 0, + name: filename, + size: size, + }, + reader: contentReader, } + + return zimFile, zimFile.fileInfo, nil + } + + // Read HTML file and inject Edge scripts + + data, err := io.ReadAll(contentReader) + if err != nil { + return nil, nil, err + } + + injected, err := b.injectEdgeScriptTag(data) + if err != nil { + logger.Error(ctx, "could not inject edge script", logger.E(errors.WithStack(err))) + } else { + data = injected } zimFile := &zimFile{ @@ -244,26 +223,86 @@ func (b *ZimBundle) renderURL(ctx context.Context, url string) (io.ReadCloser, o modTime: time.Time{}, mode: 0, name: filename, - size: int64(len(data)), + size: size, }, - buff: bytes.NewBuffer(data), + reader: io.NopCloser(bytes.NewBuffer(data)), } return zimFile, zimFile.fileInfo, nil } -func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) { - zr, err := b.openArchive() - if err != nil { - return nil, nil, errors.WithStack(err) +func (b *ZimBundle) searchEntryFromURL(ctx context.Context, url string) (zim.Entry, error) { + ctx = logger.With(ctx, logger.F("webURL", url)) + + logger.Debug(ctx, "searching entry namespace in local cache") + + entry, err := b.reader.EntryWithFullURL(url) + if err != nil && !errors.Is(err, zim.ErrNotFound) { + return nil, errors.WithStack(err) } - 
defer func() { - if err := zr.Close(); err != nil { - panic(errors.WithStack(err)) - } - }() + if entry != nil { + return entry, nil + } + contentNamespaces := []zim.Namespace{ + zim.V6NamespaceContent, + zim.V6NamespaceMetadata, + zim.V5NamespaceLayout, + zim.V5NamespaceArticle, + zim.V5NamespaceImageFile, + zim.V5NamespaceMetadata, + } + + logger.Debug( + ctx, "make educated guesses about potential url namespace", + logger.F("zimNamespaces", contentNamespaces), + ) + + for _, ns := range contentNamespaces { + logger.Debug( + ctx, "trying to access entry directly", + logger.F("zimNamespace", ns), + logger.F("zimURL", url), + ) + + entry, err := b.reader.EntryWithURL(ns, url) + if err != nil && !errors.Is(err, zim.ErrNotFound) { + return nil, errors.WithStack(err) + } + + if entry != nil { + b.urlNamespaceCache.Add(url, entry.Namespace()) + return entry, nil + } + } + + logger.Debug(ctx, "doing full entries scan") + + iterator := b.reader.Entries() + for iterator.Next() { + current := iterator.Entry() + + if current.FullURL() != url && current.URL() != url { + continue + } + + entry = current + b.urlNamespaceCache.Add(url, entry.Namespace()) + break + } + if err := iterator.Err(); err != nil { + return nil, errors.WithStack(err) + } + + if entry == nil { + return nil, errors.WithStack(zim.ErrNotFound) + } + + return entry, nil +} + +func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) { zimFile := &zimFile{ fileInfo: &zimFileInfo{ isDir: true, @@ -272,55 +311,23 @@ func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.Re name: filename, size: 0, }, - buff: bytes.NewBuffer(nil), + reader: io.NopCloser(bytes.NewBuffer(nil)), } return zimFile, zimFile.fileInfo, nil } -func (b *ZimBundle) renderRedirect(ctx context.Context, filename string, to string) (io.ReadCloser, os.FileInfo, error) { - logger.Debug(ctx, "rendering redirect", logger.F("url", to)) - - data := fmt.Sprintf(` - - - - - - 
`, to) - - stat := &zimFileInfo{ - isDir: false, - modTime: time.Time{}, - mode: 0, - name: filename, - size: int64(len(data)), +func (b *ZimBundle) renderMainPage(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) { + if err := b.init(); err != nil { + return nil, nil, errors.WithStack(err) } - buf := bytes.NewBuffer([]byte(data)) - reader := ioutil.NopCloser(buf) - - return reader, stat, nil -} - -func (b *ZimBundle) redirectToMainPage(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) { - zr, err := b.openArchive() + main, err := b.reader.MainPage() if err != nil { return nil, nil, errors.WithStack(err) } - defer func() { - if err := zr.Close(); err != nil { - panic(errors.WithStack(err)) - } - }() - - main, err := zr.MainPage() - if err != nil { - return nil, nil, errors.WithStack(err) - } - - return b.renderRedirect(ctx, filename, main.FullURL()) + return b.renderURL(ctx, main.FullURL()) } func (b *ZimBundle) injectEdgeScriptTag(data []byte) ([]byte, error) { @@ -369,13 +376,29 @@ func (b *ZimBundle) injectEdgeScriptTag(data []byte) ([]byte, error) { return buff.Bytes(), nil } -func (b *ZimBundle) openArchive() (*zim.ZimReader, error) { - zm, err := zim.NewReader(b.archivePath) - if err != nil { - return nil, errors.Wrapf(err, "could not open '%v'", b.archivePath) +func (b *ZimBundle) init() error { + b.initOnce.Do(func() { + reader, err := zim.Open(b.archivePath) + if err != nil { + b.initErr = errors.Wrapf(err, "could not open '%v'", b.archivePath) + return + } + + b.reader = reader + + cache, err := lru.New[string, zim.Namespace](128) + if err != nil { + b.initErr = errors.Wrap(err, "could not initialize cache") + return + } + + b.urlNamespaceCache = cache + }) + if b.initErr != nil { + return errors.WithStack(b.initErr) } - return zm, nil + return nil } func NewZimBundle(archivePath string) *ZimBundle { @@ -386,17 +409,30 @@ func NewZimBundle(archivePath string) *ZimBundle { type zimFile struct { fileInfo 
*zimFileInfo - buff *bytes.Buffer + reader io.ReadCloser } // Close implements fs.File. func (f *zimFile) Close() error { + if err := f.reader.Close(); err != nil { + return errors.WithStack(err) + } + return nil } // Read implements fs.File. func (f *zimFile) Read(d []byte) (int, error) { - return f.buff.Read(d) + n, err := f.reader.Read(d) + if err != nil { + if errors.Is(err, io.EOF) { + return n, err + } + + return n, errors.WithStack(err) + } + + return n, nil } // Stat implements fs.File. diff --git a/pkg/http/html5_fileserver.go b/pkg/http/html5_fileserver.go index 136cc98..5783ad3 100644 --- a/pkg/http/html5_fileserver.go +++ b/pkg/http/html5_fileserver.go @@ -27,7 +27,6 @@ func HTML5Fileserver(fs http.FileSystem) http.Handler { r.URL.Path = "/" handler.ServeHTTP(w, r) - return }