package zim

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"strings"
	"sync"

	lru "github.com/hashicorp/golang-lru/v2"
)

// Special values of Article.EntryType. Any other value is an index into
// ZimReader.mimeTypeList.
const (
	RedirectEntry   uint16 = 0xffff
	LinkTargetEntry        = 0xfffe
	DeletedEntry           = 0xfffd
)

// articlePool recycles Article values handed out by ArticleAt.
var articlePool sync.Pool

// bcache holds the most recently uncompressed clusters. It is mainly useful
// while indexing, when the same cluster is requested again and again.
var bcache *lru.Cache[any, any]

// Article is a directory entry of a ZIM file.
type Article struct {
	// EntryType is a RedirectEntry/LinkTargetEntry/DeletedEntry or an idx
	// pointing to ZimReader.mimeTypeList
	EntryType uint16
	Title     string
	URLPtr    uint64
	Namespace byte
	url       string
	blob      uint32
	cluster   uint32
	z         *ZimReader
}

// ArticleAtURLIdx is a convenience method returning the Article at URL index
// idx.
func (z *ZimReader) ArticleAtURLIdx(idx uint32) (*Article, error) {
	o, err := z.OffsetAtURLIdx(idx)
	if err != nil {
		return nil, err
	}
	return z.ArticleAt(o)
}

// MainPage returns the main page article if the file declares one, and
// (nil, nil) otherwise.
func (z *ZimReader) MainPage() (*Article, error) {
	if z.mainPage == 0xffffffff {
		return nil, nil
	}
	return z.ArticleAtURLIdx(z.mainPage)
}

// ArticleAt returns the article (directory entry) located at offset, as found
// in the URL or title pointer lists.
//
// The article is taken from articlePool when one is available. The article is
// returned even when err is non-nil, in which case it is only partially
// filled.
func (z *ZimReader) ArticleAt(offset uint64) (*Article, error) {
	// The pool yields nil when it is empty and has no New function; allocate
	// in that case instead of panicking on the type assertion.
	a, ok := articlePool.Get().(*Article)
	if !ok || a == nil {
		a = new(Article)
	}
	err := z.FillArticleAt(a, offset)
	return a, err
}

// FillArticleAt fills a with the directory entry found at offset.
//
// Every field of a is reset first, so a recycled Article never exposes values
// belonging to a previously read entry. For LinkTargetEntry and DeletedEntry
// only EntryType, URLPtr and the reader are set.
func (z *ZimReader) FillArticleAt(a *Article, offset uint64) error {
	// Articles are recycled: wipe whatever the previous user left behind.
	*a = Article{z: z, URLPtr: offset}

	mimeIdx, err := readInt16(z.bytesRangeAt(offset, offset+2))
	if err != nil {
		return fmt.Errorf("can't read article %w", err)
	}
	a.EntryType = mimeIdx

	// Linktarget or Deleted Entry
	if mimeIdx == LinkTargetEntry || mimeIdx == DeletedEntry {
		// TODO: nothing else is decoded for these entry types yet.
		return nil
	}

	s, err := z.bytesRangeAt(offset+3, offset+4)
	if err != nil {
		return err
	}
	if len(s) == 0 {
		return errors.New("can't read article namespace")
	}
	a.Namespace = s[0]

	a.cluster, err = readInt32(z.bytesRangeAt(offset+8, offset+8+4))
	if err != nil {
		return err
	}

	a.blob, err = readInt32(z.bytesRangeAt(offset+12, offset+12+4))
	if err != nil {
		return err
	}

	// The url starts at byte 12 of a redirect entry and at byte 16 of any
	// other entry.
	urlOffset := offset + 16
	if mimeIdx == RedirectEntry {
		urlOffset = offset + 12
	}

	a.url, a.Title, err = z.readURLAndTitle(urlOffset)
	return err
}

// readURLAndTitle decodes the two consecutive NUL-terminated strings (url,
// then title) starting at offset.
func (z *ZimReader) readURLAndTitle(offset uint64) (url, title string, err error) {
	// assume the url + title won't be longer than 2k
	const maxLen = 2048

	b, err := z.bytesRangeAt(offset, offset+maxLen)
	if err != nil {
		return "", "", fmt.Errorf("can't read article url and title: %w", err)
	}

	// ReadString returns freshly allocated strings, so the 2k buffer is not
	// retained by the article once this function returns.
	buf := bytes.NewBuffer(b)
	if url, err = buf.ReadString('\x00'); err != nil {
		return "", "", err
	}
	if title, err = buf.ReadString('\x00'); err != nil {
		return "", "", err
	}
	return strings.TrimRight(url, "\x00"), strings.TrimRight(title, "\x00"), nil
}

// Data returns the uncompressed data associated with this article.
//
// It returns (nil, nil) for redirect, link target and deleted entries, which
// carry no data. An error is returned when the cluster can't be read or
// decoded, when the blob offsets are inconsistent, or when the cluster uses an
// unhandled compression.
func (a *Article) Data() ([]byte, error) {
	// ensure we have data to read
	switch a.EntryType {
	case RedirectEntry, LinkTargetEntry, DeletedEntry:
		return nil, nil
	}

	start, end, err := a.z.clusterOffsetsAtIdx(a.cluster)
	if err != nil {
		return nil, err
	}

	s, err := a.z.bytesRangeAt(start, start+1)
	if err != nil {
		return nil, err
	}
	if len(s) == 0 {
		return nil, errors.New("can't read cluster compression type")
	}
	compression := s[0]

	// Position of this blob's entry in the cluster offset table. Widen before
	// multiplying so a large blob index can't wrap around uint32.
	tableOff := uint64(a.blob) * 4

	switch compression {
	case 4, 5: // LZMA: 4, Zstandard: 5
		blob, err := a.clusterData(start, end, compression)
		if err != nil {
			return nil, err
		}
		if tableOff+8 > uint64(len(blob)) {
			return nil, errors.New("blob index out of cluster range")
		}

		// blob starts at offset bs, blob ends at offset be
		bs, err := readInt32(blob[tableOff:tableOff+4], nil)
		if err != nil {
			return nil, err
		}
		be, err := readInt32(blob[tableOff+4:tableOff+8], nil)
		if err != nil {
			return nil, err
		}
		if be < bs || uint64(be) > uint64(len(blob)) {
			return nil, errors.New("invalid blob offsets")
		}

		// avoid retaining all the chunk
		c := make([]byte, be-bs)
		copy(c, blob[bs:be])
		return c, nil

	case 0, 1: // uncompressed
		startPos := start + 1
		bs, err := readInt32(a.z.bytesRangeAt(startPos+tableOff, startPos+tableOff+4))
		if err != nil {
			return nil, err
		}
		be, err := readInt32(a.z.bytesRangeAt(startPos+tableOff+4, startPos+tableOff+8))
		if err != nil {
			return nil, err
		}
		if be < bs {
			return nil, errors.New("invalid blob offsets")
		}
		return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be))
	}

	return nil, errors.New("Unhandled compression")
}

// clusterData returns the uncompressed content of the article's cluster,
// either from bcache or by decoding the bytes in (start, end] of the file with
// the decoder selected by compression (4: LZMA, 5: Zstandard).
func (a *Article) clusterData(start, end uint64, compression uint8) ([]byte, error) {
	// The cache is shared by every reader of the package, so the reader is
	// part of the key: a cluster number alone is not unique across files.
	type cacheKey struct {
		z       *ZimReader
		cluster uint32
	}
	key := cacheKey{z: a.z, cluster: a.cluster}

	if v, ok := bcache.Get(key); ok {
		if blob, ok := v.([]byte); ok {
			return blob, nil
		}
	}

	b, err := a.z.bytesRangeAt(start+1, end+1)
	if err != nil {
		return nil, err
	}

	var dec io.ReadCloser
	switch compression {
	case 5:
		dec, err = NewZstdReader(bytes.NewBuffer(b))
	case 4:
		dec, err = NewXZReader(bytes.NewBuffer(b))
	default:
		return nil, errors.New("Unhandled compression")
	}
	if err != nil {
		return nil, err
	}
	defer dec.Close()

	// the decoded chunk are around 1MB
	raw, err := ioutil.ReadAll(dec)
	if err != nil {
		return nil, err
	}

	// Copy into an exactly sized slice so the cache doesn't hold on to the
	// spare capacity ReadAll may have allocated.
	blob := make([]byte, len(raw))
	copy(blob, raw)

	// TODO: 2 requests for the same blob could occur at the same time
	bcache.Add(key, blob)
	return blob, nil
}

// MimeType returns the mime type of the article. It returns an empty string
// for redirect, link target and deleted entries, and when EntryType is not a
// valid index into the reader's mime type list.
func (a *Article) MimeType() string {
	switch a.EntryType {
	case RedirectEntry, LinkTargetEntry, DeletedEntry:
		return ""
	}
	if int(a.EntryType) >= len(a.z.mimeTypeList) {
		return ""
	}
	return a.z.mimeTypeList[a.EntryType]
}

// FullURL returns the url prefixed by the namespace.
func (a *Article) FullURL() string {
	return string(a.Namespace) + "/" + a.url
}

// String returns a one-line, human readable description of the article.
func (a *Article) String() string {
	return fmt.Sprintf("Mime: 0x%x URL: [%s], Title: [%s], Cluster: 0x%x Blob: 0x%x",
		a.EntryType, a.FullURL(), a.Title, a.cluster, a.blob)
}

// RedirectIndex returns the redirect index of a RedirectEntry article.
// It returns an error if the article is not a redirect entry.
func (a *Article) RedirectIndex() (uint32, error) {
	if a.EntryType != RedirectEntry {
		return 0, errors.New("Not a RedirectEntry")
	}
	// We use the cluster to save the redirect index position for RedirectEntry type
	return a.cluster, nil
}

// blobOffsetsAtIdx reads two consecutive 8-byte values from the table located
// at z.clusterPtrPos, at positions a.blob and a.blob+1. When the first read
// fails, the second one is skipped.
func (a *Article) blobOffsetsAtIdx(z *ZimReader) (start, end uint64) {
	// Widen before doing arithmetic so idx+1 can't wrap around uint32.
	idx := uint64(a.blob)

	offset := z.clusterPtrPos + idx*8
	start, err := readInt64(z.bytesRangeAt(offset, offset+8))
	if err != nil {
		return start, end
	}

	offset = z.clusterPtrPos + (idx+1)*8
	// The signature has no error result; a failed read is deliberately
	// reported only through whatever value readInt64 yields.
	end, _ = readInt64(z.bytesRangeAt(offset, offset+8))
	return start, end
}