package zim

import (
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"

	lru "github.com/hashicorp/golang-lru/v2"
	"github.com/pkg/errors"
	"gitlab.com/wpetit/goweb/logger"
)

const (
	// zimFormatMagicNumber is the little-endian uint32 expected at offset 0
	// of the file; parseHeader rejects any other value.
	zimFormatMagicNumber uint32 = 0x44D495A

	// nullByte terminates the strings read by readStringAt.
	nullByte = '\x00'

	// zimRedirect is the mime type index that marks an entry as a redirect
	// (see parseEntryAt).
	zimRedirect = 0xffff
)

// Reader gives access to the entries stored in a ZIM file.
//
// The header fields are populated once by parse (called from NewReader).
// Every read on the underlying seeker goes through readRange, which
// serializes the seek+read pair with seekerLock.
type Reader struct {
	majorVersion  uint16 // header offset 4
	minorVersion  uint16 // header offset 6
	uuid          string // 16 raw bytes at header offset 8, formatted by parseUUID
	entryCount    uint32 // header offset 24
	clusterCount  uint32 // header offset 28
	urlPtrPos     uint64 // header offset 32, start of the URL pointer list
	titlePtrPos   uint64 // header offset 40
	clusterPtrPos uint64 // header offset 48, start of the cluster pointer list
	mimeListPos   uint64 // header offset 56, start of the mime type list
	mainPage      uint32 // header offset 64, 0xffffffff when there is no main page
	layoutPage    uint32 // header offset 68
	checksumPos   uint64 // header offset 72

	mimeTypes []string // mime type list, in file order
	urlIndex  []uint64 // file offset of each entry, indexed by entry number

	cache *lru.Cache[string, Entry] // entries keyed by "url:…" and "title:…" keys

	seeker     io.ReadSeekCloser
	seekerLock sync.Mutex // guards seeker's position across Seek+Read
}

// Version returns the major and minor format versions read from the header.
func (r *Reader) Version() (majorVersion, minorVersion uint16) {
	return r.majorVersion, r.minorVersion
}

// EntryCount returns the number of entries declared in the header.
func (r *Reader) EntryCount() uint32 {
	return r.entryCount
}

// ClusterCount returns the number of clusters declared in the header.
func (r *Reader) ClusterCount() uint32 {
	return r.clusterCount
}

// UUID returns the archive UUID formatted as 8-4-4-4-12 hexadecimal groups.
func (r *Reader) UUID() string {
	return r.uuid
}

// Close closes the underlying seeker.
func (r *Reader) Close() error {
	if err := r.seeker.Close(); err != nil {
		return errors.WithStack(err)
	}

	return nil
}

// MainPage returns the entry designated as main page by the header.
//
// It returns ErrNotFound when the header holds the 0xffffffff sentinel, and
// also when the designated entry cannot be loaded: any failure of EntryAt is
// reported as ErrNotFound.
func (r *Reader) MainPage() (Entry, error) {
	if r.mainPage == 0xffffffff {
		return nil, errors.WithStack(ErrNotFound)
	}

	entry, err := r.EntryAt(int(r.mainPage))
	if err != nil {
		// The underlying error is intentionally replaced so that callers
		// only ever have to deal with ErrNotFound here.
		return nil, errors.WithStack(ErrNotFound)
	}

	return entry, nil
}

// Entries returns an iterator bound to this reader.
func (r *Reader) Entries() *EntryIterator {
	return &EntryIterator{
		reader: r,
	}
}

// EntryAt parses and returns the entry with the given index in the URL
// index, and stores it in the cache.
//
// It returns an error wrapping ErrInvalidEntryIndex when idx is negative or
// not smaller than the number of indexed entries.
func (r *Reader) EntryAt(idx int) (Entry, error) {
	if idx < 0 || idx >= len(r.urlIndex) {
		return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of bounds", idx)
	}

	entryPtr := r.urlIndex[idx]

	entry, err := r.parseEntryAt(int64(entryPtr))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	r.cacheEntry(entryPtr, entry)

	return entry, nil
}

// EntryWithURL returns the entry matching the given namespace and URL.
//
// The cache is consulted first, using the full URL built by toFullURL. On a
// miss, every entry is visited in iterator order and the first one whose
// namespace equals ns and whose URL or full URL equals url is returned.
// ErrNotFound is returned when no entry matches.
func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) {
	cached, found := r.getEntryByURLFromCache(toFullURL(ns, url))
	if found {
		logger.Debug(context.Background(), "found entry with url from cache", logger.F("fullURL", cached.FullURL()))
		return cached, nil
	}

	iterator := r.Entries()
	for iterator.Next() {
		candidate := iterator.Entry()
		if candidate.Namespace() == ns && (candidate.URL() == url || candidate.FullURL() == url) {
			return candidate, nil
		}
	}

	if err := iterator.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}

// EntryWithTitle returns the entry matching the given namespace and title.
//
// The cache is consulted first; on a miss every entry is visited in iterator
// order and the first match is returned. ErrNotFound is returned when no
// entry matches.
func (r *Reader) EntryWithTitle(ns Namespace, title string) (Entry, error) {
	cached, found := r.getEntryByTitleFromCache(ns, title)
	if found {
		logger.Debug(context.Background(), "found entry with title from cache", logger.F("entry", cached.FullURL()))
		return cached, nil
	}

	iterator := r.Entries()
	for iterator.Next() {
		candidate := iterator.Entry()
		if candidate.Title() == title && candidate.Namespace() == ns {
			return candidate, nil
		}
	}

	if err := iterator.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}

// getURLCacheKey returns the cache key used for a full URL.
func (r *Reader) getURLCacheKey(fullURL string) string {
	return "url:" + fullURL
}

// getTitleCacheKey returns the cache key used for a namespace/title pair.
func (r *Reader) getTitleCacheKey(ns Namespace, title string) string {
	return fmt.Sprintf("title:%s/%s", ns, title)
}

// cacheEntry stores entry under both its URL key and its title key, unless
// both keys are already present. The first parameter (the entry offset) is
// currently unused.
func (r *Reader) cacheEntry(_ uint64, entry Entry) {
	urlKey := r.getURLCacheKey(entry.FullURL())
	titleKey := r.getTitleCacheKey(entry.Namespace(), entry.Title())

	// Peek is used so that this presence check does not count as a "use"
	// of the cached values.
	_, urlFound := r.cache.Peek(urlKey)
	_, titleFound := r.cache.Peek(titleKey)

	if urlFound && titleFound {
		return
	}

	r.cache.Add(urlKey, entry)
	r.cache.Add(titleKey, entry)
}

// getEntryByURLFromCache looks up an entry by full URL in the cache.
func (r *Reader) getEntryByURLFromCache(fullURL string) (Entry, bool) {
	return r.cache.Get(r.getURLCacheKey(fullURL))
}

// getEntryByTitleFromCache looks up an entry by namespace and title in the
// cache.
func (r *Reader) getEntryByTitleFromCache(namespace Namespace, title string) (Entry, bool) {
	return r.cache.Get(r.getTitleCacheKey(namespace, title))
}

// parse loads the header, the mime type list and the URL index, in that
// order (the last two depend on positions read from the header).
func (r *Reader) parse() error {
	if err := r.parseHeader(); err != nil {
		return errors.WithStack(err)
	}

	if err := r.parseMimeTypes(); err != nil {
		return errors.WithStack(err)
	}

	if err := r.parseURLIndex(); err != nil {
		return errors.WithStack(err)
	}

	return nil
}

// parseHeader validates the magic number and reads the fixed-position header
// fields into the reader. All integers are read as little-endian.
func (r *Reader) parseHeader() error {
	magicNumber, err := r.readUint32At(0)
	if err != nil {
		return errors.WithStack(err)
	}

	if magicNumber != zimFormatMagicNumber {
		return errors.Errorf("invalid zim magic number '%d'", magicNumber)
	}

	majorVersion, err := r.readUint16At(4)
	if err != nil {
		return errors.WithStack(err)
	}

	r.majorVersion = majorVersion

	minorVersion, err := r.readUint16At(6)
	if err != nil {
		return errors.WithStack(err)
	}

	r.minorVersion = minorVersion

	if err := r.parseUUID(); err != nil {
		return errors.WithStack(err)
	}

	entryCount, err := r.readUint32At(24)
	if err != nil {
		return errors.WithStack(err)
	}

	r.entryCount = entryCount

	clusterCount, err := r.readUint32At(28)
	if err != nil {
		return errors.WithStack(err)
	}

	r.clusterCount = clusterCount

	urlPtrPos, err := r.readUint64At(32)
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlPtrPos = urlPtrPos

	titlePtrPos, err := r.readUint64At(40)
	if err != nil {
		return errors.WithStack(err)
	}

	r.titlePtrPos = titlePtrPos

	clusterPtrPos, err := r.readUint64At(48)
	if err != nil {
		return errors.WithStack(err)
	}

	r.clusterPtrPos = clusterPtrPos

	mimeListPos, err := r.readUint64At(56)
	if err != nil {
		return errors.WithStack(err)
	}

	r.mimeListPos = mimeListPos

	mainPage, err := r.readUint32At(64)
	if err != nil {
		return errors.WithStack(err)
	}

	r.mainPage = mainPage

	layoutPage, err := r.readUint32At(68)
	if err != nil {
		return errors.WithStack(err)
	}

	r.layoutPage = layoutPage

	checksumPos, err := r.readUint64At(72)
	if err != nil {
		return errors.WithStack(err)
	}

	r.checksumPos = checksumPos

	return nil
}

// parseUUID reads the 16 UUID bytes at header offset 8 and stores them in
// r.uuid as five dash-separated, zero-padded hexadecimal groups
// (8-4-4-4-12 digits), each group decoded as big-endian.
func (r *Reader) parseUUID() error {
	data := make([]byte, 16)
	if err := r.readRange(8, data); err != nil {
		return errors.WithStack(err)
	}

	parts := make([]string, 0, 5)

	val32, err := readUint32(data[0:4], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%08x", val32))

	val16, err := readUint16(data[4:6], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%04x", val16))

	val16, err = readUint16(data[6:8], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%04x", val16))

	val16, err = readUint16(data[8:10], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	parts = append(parts, fmt.Sprintf("%04x", val16))

	val32, err = readUint32(data[10:14], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	val16, err = readUint16(data[14:16], binary.BigEndian)
	if err != nil {
		return errors.WithStack(err)
	}

	// The last group covers 6 bytes. Both halves must be zero-padded,
	// otherwise leading zero bytes would silently shorten the group.
	parts = append(parts, fmt.Sprintf("%08x%04x", val32, val16))

	r.uuid = strings.Join(parts, "-")

	return nil
}

// parseMimeTypes reads consecutive null-terminated strings starting at
// mimeListPos until an empty string is found, and stores them in r.mimeTypes.
func (r *Reader) parseMimeTypes() error {
	mimeTypes := make([]string, 0)
	offset := int64(r.mimeListPos)

	for {
		mimeType, read, err := r.readStringAt(offset)
		if err != nil {
			return errors.WithStack(err)
		}

		if mimeType == "" {
			break
		}

		mimeTypes = append(mimeTypes, mimeType)

		// Skip the string itself plus its null terminator.
		offset += read + 1
	}

	r.mimeTypes = mimeTypes

	return nil
}

// parseURLIndex loads the entry pointer list located at urlPtrPos into
// r.urlIndex.
func (r *Reader) parseURLIndex() error {
	urlIndex, err := r.parseEntryIndex(int64(r.urlPtrPos))
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlIndex = urlIndex

	return nil
}

// parseEntryAt parses the entry stored at the given file offset. Entries
// whose mime type index equals zimRedirect are parsed as redirect entries,
// all others as content entries.
func (r *Reader) parseEntryAt(offset int64) (Entry, error) {
	base, err := r.parseBaseEntry(offset)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	if base.mimeTypeIndex == zimRedirect {
		redirect, err := r.parseRedirectEntry(offset, base)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return redirect, nil
	}

	content, err := r.parseContentEntry(offset, base)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	return content, nil
}

// parseEntryIndex reads entryCount little-endian uint64 pointers stored
// contiguously from startAddr.
//
// Pointers are fetched in chunks rather than one by one, so that a large
// index does not cost one locked seek+read per entry.
func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) {
	const (
		ptrSize      = 8
		chunkEntries = 4096
	)

	total := int64(r.entryCount)
	index := make([]uint64, r.entryCount)

	chunk := int64(chunkEntries)
	if total < chunk {
		chunk = total
	}

	buf := make([]byte, chunk*ptrSize)

	for done := int64(0); done < total; {
		n := total - done
		if n > chunk {
			n = chunk
		}

		data := buf[:n*ptrSize]
		if err := r.readRange(startAddr+done*ptrSize, data); err != nil {
			return nil, errors.WithStack(err)
		}

		for i := int64(0); i < n; i++ {
			index[done+i] = binary.LittleEndian.Uint64(data[i*ptrSize:])
		}

		done += n
	}

	return index, nil
}

// getClusterOffsets returns the start offset of the cluster with the given
// index and its end offset, computed as the next cluster pointer minus one.
//
// NOTE(review): for the last cluster this reads the 8 bytes following the
// cluster pointer list; whether that position holds a meaningful end offset
// is not established by this file — confirm against the format
// specification. The decrement is unsigned, so a next pointer of 0 wraps.
func (r *Reader) getClusterOffsets(clusterIndex int) (uint64, uint64, error) {
	startClusterPtrOffset := r.clusterPtrPos + (uint64(clusterIndex) * 8)

	startClusterOffset, err := r.readUint64At(int64(startClusterPtrOffset))
	if err != nil {
		return 0, 0, errors.WithStack(err)
	}

	endClusterPtrOffset := r.clusterPtrPos + (uint64(clusterIndex+1) * 8)

	endClusterOffset, err := r.readUint64At(int64(endClusterPtrOffset))
	if err != nil {
		return 0, 0, errors.WithStack(err)
	}

	endClusterOffset--

	return startClusterOffset, endClusterOffset, nil
}

// readRange fills v with the len(v) bytes found at the given absolute
// offset. The seek and the read are performed under seekerLock so that
// concurrent callers cannot move the position in between.
//
// An error is returned when fewer than len(v) bytes are available.
func (r *Reader) readRange(offset int64, v []byte) error {
	r.seekerLock.Lock()
	defer r.seekerLock.Unlock()

	if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil {
		return errors.WithStack(err)
	}

	// A single Read may return fewer bytes than requested without this
	// being an error; io.ReadFull keeps reading until v is full.
	if _, err := io.ReadFull(r.seeker, v); err != nil {
		if err == io.ErrUnexpectedEOF {
			return errors.Wrap(err, "could not read enough bytes")
		}

		return errors.WithStack(err)
	}

	return nil
}

// readUint32At reads a little-endian uint32 at the given offset.
func (r *Reader) readUint32At(offset int64) (uint32, error) {
	data := make([]byte, 4)
	if err := r.readRange(offset, data); err != nil {
		return 0, errors.WithStack(err)
	}

	value, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

// readUint16At reads a little-endian uint16 at the given offset.
func (r *Reader) readUint16At(offset int64) (uint16, error) {
	data := make([]byte, 2)
	if err := r.readRange(offset, data); err != nil {
		return 0, errors.WithStack(err)
	}

	value, err := readUint16(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

// readUint64At reads a little-endian uint64 at the given offset.
func (r *Reader) readUint64At(offset int64) (uint64, error) {
	data := make([]byte, 8)
	if err := r.readRange(offset, data); err != nil {
		return 0, errors.WithStack(err)
	}

	value, err := readUint64(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

// readStringAt reads a null-terminated string starting at offset.
//
// It returns the string without its terminator and the number of bytes that
// precede the terminator. On error, the count of bytes consumed so far is
// returned alongside an empty string.
func (r *Reader) readStringAt(offset int64) (string, int64, error) {
	var sb strings.Builder

	buf := make([]byte, 1)

	for read := int64(0); ; read++ {
		if err := r.readRange(offset+read, buf); err != nil {
			return "", read, errors.WithStack(err)
		}

		if buf[0] == nullByte {
			return sb.String(), read, nil
		}

		// strings.Builder.WriteByte always returns a nil error.
		_ = sb.WriteByte(buf[0])
	}
}

// NewReader creates a Reader on top of seeker and parses the header, mime
// type list and URL index right away.
//
// The seeker is not closed when parsing fails; it remains owned by the
// caller in that case.
func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) {
	opts := NewOptions(funcs...)

	cache, err := lru.New[string, Entry](opts.CacheSize)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader := &Reader{
		seeker: seeker,
		cache:  cache,
	}

	if err := reader.parse(); err != nil {
		return nil, errors.WithStack(err)
	}

	return reader, nil
}

// Open opens the file at path read-only and returns a Reader for it. The
// file is closed by Reader.Close, or immediately when the reader cannot be
// created.
func Open(path string, funcs ...OptionFunc) (*Reader, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader, err := NewReader(file, funcs...)
	if err != nil {
		// Do not leak the descriptor; the creation error is the one worth
		// reporting, so a failure to close is deliberately ignored.
		_ = file.Close()

		return nil, errors.WithStack(err)
	}

	return reader, nil
}