package zim

import (
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"strings"

	lru "github.com/hashicorp/golang-lru/v2"
	"github.com/pkg/errors"
)

const (
	// zimFormatMagicNumber is the value the first four bytes of the file
	// (read as a little-endian uint32) must match for parsing to proceed.
	zimFormatMagicNumber uint32 = 0x44D495A

	// nullByte terminates the strings read by readStringAt.
	nullByte = '\x00'

	// zimRedirect is the MIME type index that marks an entry as a redirect
	// rather than a content entry (see parseEntryAt).
	zimRedirect = 0xffff
)

// Reader gives access to the entries of a ZIM archive backed by an
// io.ReadSeekCloser.
//
// Every read seeks the shared underlying stream, and nothing in this file
// synchronizes access to it.
type Reader struct {
	// Header fields, filled in by parseHeader. The byte offsets they are
	// read from are documented there.
	majorVersion  uint16
	minorVersion  uint16
	uuid          string
	entryCount    uint32
	clusterCount  uint32
	urlPtrPos     uint64
	titlePtrPos   uint64
	clusterPtrPos uint64
	mimeListPos   uint64
	mainPage      uint32
	layoutPage    uint32
	checksumPos   uint64

	// mimeTypes is the list of NUL-terminated strings found at mimeListPos.
	mimeTypes []string

	// urlIndex holds one entry offset per entry, read from urlPtrPos.
	urlIndex []uint64

	// urlCache and titleCache map a "namespace/url" key, respectively a
	// title, to the file offset of an entry that was already parsed.
	urlCache   *lru.Cache[string, uint64]
	titleCache *lru.Cache[string, uint64]

	seeker io.ReadSeekCloser
}

// Version returns the major and minor version numbers read from the header.
func (r *Reader) Version() (majorVersion, minorVersion uint16) {
	return r.majorVersion, r.minorVersion
}

// EntryCount returns the number of entries declared in the header.
func (r *Reader) EntryCount() uint32 {
	return r.entryCount
}

// ClusterCount returns the number of clusters declared in the header.
func (r *Reader) ClusterCount() uint32 {
	return r.clusterCount
}

// UUID returns the archive UUID formatted as five dash-separated groups of
// lowercase hexadecimal digits (8-4-4-4-12).
func (r *Reader) UUID() string {
	return r.uuid
}

// Entries returns a new iterator over the entries of the archive.
func (r *Reader) Entries() *EntryIterator {
	return &EntryIterator{
		reader: r,
	}
}

// EntryAt parses and returns the entry stored at position idx of the URL
// index, and records its offset in the URL and title caches.
//
// It returns an error wrapping ErrInvalidEntryIndex when idx is negative or
// not smaller than the number of indexed entries.
func (r *Reader) EntryAt(idx int) (Entry, error) {
	if idx >= len(r.urlIndex) || idx < 0 {
		return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of bounds", idx)
	}

	entryPtr := r.urlIndex[idx]

	entry, err := r.parseEntryAt(int64(entryPtr))
	if err != nil {
		return nil, errors.WithStack(err)
	}

	r.cacheEntry(entryPtr, entry)

	return entry, nil
}

// EntryWithURL returns the entry matching the given namespace and URL.
//
// The URL cache is consulted first; on a miss every entry is visited in
// iterator order until one matches. ErrNotFound is returned when the
// iteration completes without a match.
func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) {
	offset, found := r.getEntryOffsetByURLFromCache(ns, url)
	if found {
		entry, err := r.parseEntryAt(int64(offset))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return entry, nil
	}

	iterator := r.Entries()

	for iterator.Next() {
		entry := iterator.Entry()

		if entry.Namespace() == ns && entry.URL() == url {
			return entry, nil
		}
	}

	if err := iterator.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}

// EntryWithTitle returns the first entry, in iterator order, whose title
// equals title.
//
// The title cache is consulted first; on a miss every entry is visited until
// one matches. ErrNotFound is returned when the iteration completes without
// a match.
func (r *Reader) EntryWithTitle(title string) (Entry, error) {
	offset, found := r.getEntryOffsetByTitleFromCache(title)
	if found {
		entry, err := r.parseEntryAt(int64(offset))
		if err != nil {
			return nil, errors.WithStack(err)
		}

		return entry, nil
	}

	iterator := r.Entries()

	for iterator.Next() {
		entry := iterator.Entry()

		if entry.Title() == title {
			return entry, nil
		}
	}

	if err := iterator.Err(); err != nil {
		return nil, errors.WithStack(err)
	}

	return nil, errors.WithStack(ErrNotFound)
}

// getURLCacheKey builds the URL cache key ("namespace/url") for entry. The
// format must stay in sync with getEntryOffsetByURLFromCache.
func (r *Reader) getURLCacheKey(entry Entry) string {
	return fmt.Sprintf("%s/%s", entry.Namespace(), entry.URL())
}

// cacheEntry records offset under both the URL key and the title of entry.
func (r *Reader) cacheEntry(offset uint64, entry Entry) {
	urlKey := r.getURLCacheKey(entry)
	r.urlCache.Add(urlKey, offset)
	r.titleCache.Add(entry.Title(), offset)
}

// getEntryOffsetByURLFromCache returns the cached offset for the given
// namespace and URL, and whether it was present in the cache.
func (r *Reader) getEntryOffsetByURLFromCache(namespace Namespace, url string) (uint64, bool) {
	key := fmt.Sprintf("%s/%s", namespace, url)
	return r.urlCache.Get(key)
}

// getEntryOffsetByTitleFromCache returns the cached offset for title, and
// whether it was present in the cache.
func (r *Reader) getEntryOffsetByTitleFromCache(title string) (uint64, bool) {
	return r.titleCache.Get(title)
}

// parse reads, in order, the header, the MIME type list and the URL index.
// The later steps depend on positions read from the header.
func (r *Reader) parse() error {
	if err := r.parseHeader(); err != nil {
		return errors.WithStack(err)
	}

	if err := r.parseMimeTypes(); err != nil {
		return errors.WithStack(err)
	}

	if err := r.parseURLIndex(); err != nil {
		return errors.WithStack(err)
	}

	return nil
}

// parseHeader validates the magic number and reads the fixed-position header
// fields into r. All integers are read as little-endian.
func (r *Reader) parseHeader() error {
	// Bytes 0-3: magic number.
	magicNumber, err := r.readUint32At(0)
	if err != nil {
		return errors.WithStack(err)
	}

	if magicNumber != zimFormatMagicNumber {
		return errors.Errorf("invalid zim magic number '%d'", magicNumber)
	}

	// Bytes 4-5: major version.
	majorVersion, err := r.readUint16At(4)
	if err != nil {
		return errors.WithStack(err)
	}

	r.majorVersion = majorVersion

	// Bytes 6-7: minor version.
	minorVersion, err := r.readUint16At(6)
	if err != nil {
		return errors.WithStack(err)
	}

	r.minorVersion = minorVersion

	// Bytes 8-23: UUID.
	if err := r.parseUUID(); err != nil {
		return errors.WithStack(err)
	}

	// Bytes 24-27: entry count.
	entryCount, err := r.readUint32At(24)
	if err != nil {
		return errors.WithStack(err)
	}

	r.entryCount = entryCount

	// Bytes 28-31: cluster count.
	clusterCount, err := r.readUint32At(28)
	if err != nil {
		return errors.WithStack(err)
	}

	r.clusterCount = clusterCount

	// Bytes 32-39: position of the URL pointer list.
	urlPtrPos, err := r.readUint64At(32)
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlPtrPos = urlPtrPos

	// Bytes 40-47: position of the title pointer list.
	titlePtrPos, err := r.readUint64At(40)
	if err != nil {
		return errors.WithStack(err)
	}

	r.titlePtrPos = titlePtrPos

	// Bytes 48-55: position of the cluster pointer list.
	clusterPtrPos, err := r.readUint64At(48)
	if err != nil {
		return errors.WithStack(err)
	}

	r.clusterPtrPos = clusterPtrPos

	// Bytes 56-63: position of the MIME type list.
	mimeListPos, err := r.readUint64At(56)
	if err != nil {
		return errors.WithStack(err)
	}

	r.mimeListPos = mimeListPos

	// Bytes 64-67: main page.
	mainPage, err := r.readUint32At(64)
	if err != nil {
		return errors.WithStack(err)
	}

	r.mainPage = mainPage

	// Bytes 68-71: layout page.
	layoutPage, err := r.readUint32At(68)
	if err != nil {
		return errors.WithStack(err)
	}

	r.layoutPage = layoutPage

	// Bytes 72-79: position of the checksum.
	checksumPos, err := r.readUint64At(72)
	if err != nil {
		return errors.WithStack(err)
	}

	r.checksumPos = checksumPos

	return nil
}

// parseUUID reads the 16 bytes at offset 8 and stores them in r.uuid as five
// dash-separated hexadecimal groups of 8, 4, 4, 4 and 12 digits.
func (r *Reader) parseUUID() error {
	data := make([]byte, 16)
	if err := r.readRange(8, data); err != nil {
		return errors.WithStack(err)
	}

	// %x on a byte slice always emits two digits per byte, in slice order,
	// so every group keeps its full width even when it starts with zeros.
	r.uuid = fmt.Sprintf(
		"%x-%x-%x-%x-%x",
		data[0:4], data[4:6], data[6:8], data[8:10], data[10:16],
	)

	return nil
}

// parseMimeTypes reads consecutive NUL-terminated strings starting at
// r.mimeListPos until it meets an empty one, which ends the list.
func (r *Reader) parseMimeTypes() error {
	mimeTypes := make([]string, 0)
	offset := int64(r.mimeListPos)

	for {
		mimeType, read, err := r.readStringAt(offset)
		if err != nil {
			return errors.WithStack(err)
		}

		if mimeType == "" {
			break
		}

		mimeTypes = append(mimeTypes, mimeType)

		// Skip the string and its NUL terminator.
		offset += read + 1
	}

	r.mimeTypes = mimeTypes

	return nil
}

// parseURLIndex loads the entry offsets stored at r.urlPtrPos into r.urlIndex.
func (r *Reader) parseURLIndex() error {
	urlIndex, err := r.parseEntryIndex(int64(r.urlPtrPos))
	if err != nil {
		return errors.WithStack(err)
	}

	r.urlIndex = urlIndex

	return nil
}

// parseEntryAt parses the entry located at offset. Entries whose MIME type
// index equals zimRedirect are parsed as redirect entries, all others as
// content entries.
func (r *Reader) parseEntryAt(offset int64) (Entry, error) {
	base, err := r.parseBaseEntry(offset)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	var entry Entry

	if base.mimeTypeIndex == zimRedirect {
		entry, err = r.parseRedirectEntry(offset, base)
		if err != nil {
			return nil, errors.WithStack(err)
		}
	} else {
		entry, err = r.parseContentEntry(offset, base)
		if err != nil {
			return nil, errors.WithStack(err)
		}
	}

	return entry, nil
}

// parseEntryIndex reads r.entryCount consecutive little-endian uint64 values
// starting at startAddr and returns them in file order.
func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) {
	index := make([]uint64, r.entryCount)

	data := make([]byte, 8)
	for i := int64(0); i < int64(r.entryCount); i++ {
		if err := r.readRange(startAddr+i*8, data); err != nil {
			return nil, errors.WithStack(err)
		}

		ptr, err := readUint64(data, binary.LittleEndian)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		index[i] = ptr
	}

	return index, nil
}

// readRange fills v with the len(v) bytes found at offset, measured from the
// start of the underlying stream.
//
// It returns an error when seeking fails, when reading fails, or when the
// stream ends before v is full.
func (r *Reader) readRange(offset int64, v []byte) error {
	if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil {
		return errors.WithStack(err)
	}

	// A single Read call is allowed to return fewer bytes than requested
	// without reporting an error, so keep reading until v is full.
	if _, err := io.ReadFull(r.seeker, v); err != nil {
		if err == io.ErrUnexpectedEOF {
			return errors.Wrap(err, "could not read enough bytes")
		}

		return errors.WithStack(err)
	}

	return nil
}

// readUint32At reads a little-endian uint32 located at offset.
func (r *Reader) readUint32At(offset int64) (uint32, error) {
	data := make([]byte, 4)
	if err := r.readRange(offset, data); err != nil {
		return 0, errors.WithStack(err)
	}

	value, err := readUint32(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

// readUint16At reads a little-endian uint16 located at offset.
func (r *Reader) readUint16At(offset int64) (uint16, error) {
	data := make([]byte, 2)
	if err := r.readRange(offset, data); err != nil {
		return 0, errors.WithStack(err)
	}

	value, err := readUint16(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

// readUint64At reads a little-endian uint64 located at offset.
func (r *Reader) readUint64At(offset int64) (uint64, error) {
	data := make([]byte, 8)
	if err := r.readRange(offset, data); err != nil {
		return 0, errors.WithStack(err)
	}

	value, err := readUint64(data, binary.LittleEndian)
	if err != nil {
		return 0, errors.WithStack(err)
	}

	return value, nil
}

// readStringAt reads the NUL-terminated string starting at offset.
//
// It returns the string without its terminator and the number of bytes that
// precede the terminator. On error the string is empty and the count is the
// number of bytes consumed before the failure.
func (r *Reader) readStringAt(offset int64) (string, int64, error) {
	data := make([]byte, 1)

	var sb strings.Builder

	read := int64(0)

	for {
		if err := r.readRange(offset+read, data); err != nil {
			return "", read, errors.WithStack(err)
		}

		if data[0] == nullByte {
			return sb.String(), read, nil
		}

		// strings.Builder.WriteByte is documented to always return nil.
		_ = sb.WriteByte(data[0])

		read++
	}
}

// Close closes the underlying stream.
func (r *Reader) Close() error {
	if err := r.seeker.Close(); err != nil {
		return errors.WithStack(err)
	}

	return nil
}

// NewReader creates a Reader on top of seeker and immediately parses the
// header, the MIME type list and the URL index.
//
// The URL and title caches are sized from the options built out of funcs.
// seeker is not closed when an error is returned; it remains the caller's
// responsibility in that case.
func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) {
	opts := NewOptions(funcs...)

	urlCache, err := lru.New[string, uint64](opts.URLCacheSize)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	titleCache, err := lru.New[string, uint64](opts.TitleCacheSize)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader := &Reader{
		seeker:     seeker,
		urlCache:   urlCache,
		titleCache: titleCache,
	}

	if err := reader.parse(); err != nil {
		return nil, errors.WithStack(err)
	}

	return reader, nil
}

// Open opens the file at path for reading and returns a Reader for it.
//
// The file is closed before returning when the Reader cannot be created;
// otherwise it is released by Reader.Close.
func Open(path string, funcs ...OptionFunc) (*Reader, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	reader, err := NewReader(file, funcs...)
	if err != nil {
		// Best-effort cleanup: the parse error is the one worth reporting,
		// so a failure to close the file is deliberately ignored.
		_ = file.Close()

		return nil, errors.WithStack(err)
	}

	return reader, nil
}