Source code for k1lib.selen

# AUTOGENERATED FILE! PLEASE DON'T EDIT HERE. EDIT THE SOURCE NOTEBOOKS INSTEAD
"""Nice website analyzer based on Selenium. Whenever I used Selenium
in the past, I've always had to find how to reach specific buttons,
what paragraphs to capture, things like that, all manually. That
really limits my usage of it. Recently, I've made the module ``kapi``,
capable of doing a bunch of DL stuff, and it feels like this is the right
time to make something on top of Selenium that is able to pull out the
main article, body, header, right/left bar automatically. After that, I
can grab the main text to put it into an embedding db to use it further
downstream. Example::

    from k1lib.imports import *

    browser = selen.getBrowser() # fires up the browser
    browser.get("https://en.wikipedia.org/wiki/Cheese")

    page = selen.Page.analyze(browser) # analyze everything. For a typical page, should take 8-14s total
    page.draw()             # run this inside a notebook to view all stages
    page.mainContents()     # return List[Element] of potential big pieces of the page, like header, footer, left/right bar, etc.

    elem = page.mainContent # grabs what it thinks is the most likely meat of the page, ignoring everything else
    elem.obj                # accesses the internal Selenium Element
    elem.obj.text           # grabs element's text content
    elem.x                  # grabs element's x location. Works with .y too
    elem.w                  # grabs element's width. Works with .h too
"""
import k1lib as k1, numpy as np; import k1lib.cli as cli; import k1lib.p5 as p5
from typing import List
by = k1.dep("selenium.webdriver.common.by")
__all__ = ["getBrowser", "Page", "Element"]
# dummy classes representing type information
# Placeholder types: they carry no behaviour and exist only so the annotations in this file read well
class Meta:
    # Stands for the 9-slot list describing one element. Slots, by index:
    # 0 obj (underlying element), 1 x, 2 y, 3 w, 4 h, 5 wh (area), 6 type, 7 path, 8 %area of parent
    pass
class Browser:
    # Stands for the driver object handed out by getBrowser()
    pass
Stages = List[List[Meta]] # a list of stages, each stage being a list of Meta rows

[docs]
def getBrowser() -> Browser:
    """Starts a new Chrome session and returns the object used to drive it.

    Both third-party modules are obtained through ``k1.dep`` (module name plus a
    pinned requirement string). The value returned by
    ``ChromeDriverManager().install()`` is handed to ``Chrome`` as its only argument."""
    webdriverMod = k1.dep("selenium.webdriver", "selenium==4.8.3")
    chromeCls = webdriverMod.Chrome # resolved before the driver manager runs, same order as a single chained call
    managerMod = k1.dep("webdriver_manager.chrome", "webdriver_manager==4.0.1")
    driverLocation = managerMod.ChromeDriverManager().install()
    return chromeCls(driverLocation)

def getMeta(e:"WebElement") -> "Meta":
    """Builds the metadata row for one raw Selenium element.

    :param e: Selenium element, i.e. anything exposing ``.location`` (dict with
        ``x``/``y``) and ``.size`` (dict with ``width``/``height``). This is NOT this
        module's ``Element`` wrapper, which has neither attribute
    :return: 9-slot list ``[obj, x, y, w, h, w*h, type, path, %area of parent]``.
        ``type`` starts out as None, ``path`` as ``"r"`` and the area fraction as 0"""
    loc = e.location; size = e.size
    x, y = loc["x"], loc["y"]
    w, h = size["width"], size["height"]
    return [e, x, y, w, h, w*h, None, "r", 0]
def getChildren(parent:"Meta", autoInc):
    """Discovers the reasonably sized children of an element and classifies the element.

    Starting at ``parent``, direct children are fetched and those with area (slot 5)
    not above 2000 are ignored. While exactly one child survives, the search descends
    into that child, since a lone wrapper adds no structure. Once two or more survive
    they become the result.

    Side effects on the ORIGINAL ``parent`` row: slot 6 is set to one of

    - ``"partition"``: clear decomposition into multiple large components
    - ``"delegation"``: 1 giant component (>85% of the combined child area and >70%
      of the parent's area) plus other smaller ones

    A third type, "weird" (children overlapping each other or bigger than the parent,
    indicating position absolute/fixed), is planned but not detected right now.

    :param parent: Meta row whose slot 0 is the Selenium element to expand
    :param autoInc: callable returning a fresh number on every call, used to build child paths
    :return: list of child Meta rows, with slot 7 set to ``"<parent path>/<n>"`` and slot 8
        set to that child's area divided by the parent's area. Empty list if the
        element can't be queried or no sizeable child exists"""
    ogParent = parent
    while True:
        try: found = parent[0].find_elements(by.By.XPATH, "*") | cli.apply(getMeta) | cli.deref()
        except Exception: return [] # elements can go stale! Treated as "no children" on purpose
        children = [c for c in found if c[5] > 2000] # drops tiny elements
        if len(children) == 0: return [] # short circuits
        if len(children) > 1: break
        parent = children[0] # lone sizeable child, so loop around more, because it's not useful on its own
    for child in children: child[7] = f"{ogParent[7]}/{autoInc()}" # injects path into children
    pwh = ogParent[3]*ogParent[4] or 1e-3 # guards against zero-area parents
    # %area of each child, kept in the SAME order as `children` so that every child gets its own value below
    pAreas = np.array([c[5] for c in children]) / pwh
    largest = pAreas.max()
    # largest/sum judges the relative size among siblings, `largest` alone judges it against the parent.
    # A child that dominates its siblings but not the parent still counts as a partition
    if largest/pAreas.sum() > 0.85 and largest > 0.7: ogParent[6] = "delegation"
    else: ogParent[6] = "partition"
    for child, pA in zip(children, pAreas): child[8] = pA
    return children
def getStages(browser:Browser):
    """Walks the page level by level, starting from ``<body>``.

    Stage 0 is ``[body]``. Every further stage is the concatenation of
    ``getChildren()`` over all elements of the previous stage. Stops as soon as a
    stage comes back empty, or after 30 expansion rounds at most.

    :param browser: driver object exposing ``find_element``
    :return: list of stages, each one a list of Meta rows"""
    root = getMeta(browser.find_element(by.By.CSS_SELECTOR, "body"))
    counter = k1.AutoIncrement() # shared across the whole walk, so every path segment is unique
    frontier = [root]
    stages = [frontier]
    for _ in range(30):
        frontier = frontier | cli.apply(getChildren, autoInc=counter) | cli.joinSt() | cli.deref()
        if len(frontier) == 0:
            break
        stages.append(frontier)
    return stages
def draw(elem:"Meta"):
    """Draws one element onto the current p5 sketch: its bounding box plus a short label.

    The box is red for ``"partition"`` elements and black otherwise. The label is
    ``"<path> - <type>"`` followed by the element's text, cut to 3 lines in total.

    Drawing is best effort: any ordinary error (for example the element having gone
    stale when its text is read) makes this element get skipped silently.
    Interpreter-level signals such as KeyboardInterrupt are NOT swallowed, so a long
    render can still be aborted.

    :param elem: Meta row of the element to draw"""
    try:
        text = elem[0].text # read first, so that nothing is drawn at all if the element is gone
        p5.stroke(*([255, 0, 0] if elem[6] == "partition" else [0, 0, 0])); p5.noFill(); p5.rect(*elem[1:5])
        p5.noStroke(); p5.fill(0); p5.text((f"{elem[7]} - {elem[6]}\n" + text).split("\n") | cli.head(3) | cli.join("\n"), elem[1], elem[2]+10)
    except Exception: pass # deliberate: one bad element shouldn't ruin the whole picture
def drawStages(stages:Stages):
    """Renders every stage into its own sketch and collects the results.

    The canvas takes the size of the root element (``stages[0][0]``), capped at
    3000 in both directions. If the root is smaller than 10 in either direction,
    the size is estimated from the elements themselves instead.

    :param stages: output of ``getStages()``
    :return: list with one ``p5.svg()`` result per stage, in stage order"""
    root = stages[0][0]
    w, h = root[3], root[4]
    if w < 10 or h < 10: # some sites (looking at you, YouTube), have 0px body height, which screws the rendering up
        # Some sites have elements way beyond bounds, so those have to be filtered out. Far corners (x+w, y+h)
        # at or past 3000 are dropped, then each axis is reduced to one value - presumably the biggest one
        # left after skipping the top 5%, TODO confirm against the cli docs - and padded by 20
        farCorners = ~cli.apply(lambda x,y,w,h: [x+w,y+h])
        pickExtent = cli.apply(~cli.sort(None) | ~cli.head(0.05) | cli.item() | cli.op()+20)
        w, h = stages | cli.joinSt() | cli.cut(1, 2, 3, 4) | farCorners | cli.filt("x<3000", [0,1]) | cli.transpose() | pickExtent
    w = min(w, 3000); h = min(h, 3000) # again, making sure it doesn't draw something completely insane
    svgs = []
    for stage in stages:
        p5.newSketch(w, h, False)
        p5.background(255)
        stage | cli.apply(draw) | cli.deref() # deref forces the lazy apply to actually run
        svgs.append(p5.svg())
    return svgs

[docs]
class Page:
    """Outcome of analyzing one web page: the staged element rows plus lookup tables over them.

    Attributes set by the constructor:

    - ``stages``: the stages as given. ``stages[0][0]`` is treated as the root element
    - ``path2Elem``: full path (slot 7) -> Meta row
    - ``idx2Elem``: last path segment -> Meta row
    - ``path2CPaths``: path -> list of direct children paths, empty list if there are none
    - ``w``, ``h``, ``wh``: width, height and area of the root element"""
    def __init__(self, stages:"Stages"):
        self.stages = stages; self._imgs = None # cached images
        self.path2Elem = stages | cli.joinSt() | cli.apply(lambda x: [x[7], x]) | cli.toDict()
        self.idx2Elem = stages | cli.joinSt() | cli.apply(lambda x: [x[7].split("/")[-1], x]) | cli.toDict()
        self.path2CPaths = stages | cli.joinSt() | cli.cut(7) | cli.apply(lambda x: ["/".join(x.split("/")[:-1]), x]) | cli.groupBy(0, True) | cli.apply(cli.item().all(), 1) | cli.deref() | cli.toDict(f=lambda:[]) # path to children paths
        self.w, self.h = self.stages[0][0][3:5]; self.wh = self.w*self.h

    @staticmethod
    def analyze(browser, bounded=True):
        """Analyze whatever is on the browser at this moment.

        :param browser: driver object, as returned by ``getBrowser()``
        :param bounded: if True, adjusts all internal bounding boxes so that they don't
            overflow. If False, then children can be bigger than its parent
        :return: the resulting :class:`Page`"""
        page = Page(getStages(browser))
        return page.bound() if bounded else page

    def draw(self):
        """Quickly views each analysis stage, to see bounding boxes, their paths and whatnot.
        Images are rendered on first use and cached afterwards"""
        if not self._imgs: self._imgs = drawStages(self.stages)
        return self._imgs | k1.viz.Carousel()

    def __getitem__(self, s) -> "Element | None":
        """Looks an element up by full path first, then by last path segment.

        :param s: path or index. Anything is accepted, as it's formatted into a string first
        :return: matching :class:`Element`, or None if nothing matches"""
        s = f"{s}"
        if s in self.path2Elem: return Element(self, self.path2Elem[s])
        if s in self.idx2Elem: return Element(self, self.idx2Elem[s])
        return None

    def mainContents(self):
        """Returns some candidate Elements that seem to be the main content. You'd have
        to write some minimum code to determine what you'd like to use, but the bulk
        of the work by this point is done.

        Candidates are the "partition" elements ranked by area, biggest first, at most 10 of them.

        :return: list of :class:`Element`"""
        ranked = self.stages | cli.joinSt() | cli.filt(cli.op()=="partition", 6) | ~cli.sort(5) | cli.cut(7, 5) | cli.head(10) | cli.deref()
        return ranked | cli.cut(0) | cli.lookup(self) | cli.deref()

    @property
    def mainContent(self):
        """Really tries to extract out the main content, so that this can be
        automated. It might not be good, but at least it's automated.

        Picks the first candidate of :meth:`mainContents` covering less than 70% of the
        page. If that fails for any reason (e.g. no candidate qualifies), falls back to
        the very first candidate"""
        try: return self.mainContents() | cli.filt("x.pArea < 0.7") | cli.item()
        except Exception: return self.mainContents() | cli.item()

    @staticmethod
    def _clipToParent(elem, parent):
        """Shrinks ``elem``'s box (slots 1-5) in place to its overlap with ``parent``'s box. No overlap gives size 0, never a negative size"""
        ex2 = elem[1] + elem[3]; ey2 = elem[2] + elem[4]
        px2 = parent[1] + parent[3]; py2 = parent[2] + parent[4]
        elem[1] = max(elem[1], parent[1]); elem[3] = max(0, min(ex2, px2) - elem[1])
        elem[2] = max(elem[2], parent[2]); elem[4] = max(0, min(ey2, py2) - elem[2])
        # without the clamps above, two negative sizes would multiply into a bogus positive area
        elem[5] = elem[3]*elem[4]

    def bound(self) -> "Page":
        """Some children are detached from the parent, sit outside of it, and can grow bigger. But
        that messes up the ranking technique that I have right now. So the purpose of this
        is to limit the children to the parent's size.

        Stages are processed in order, so every parent is already clipped by the time its
        children are clipped against it. Slot 8 (%area of parent) is left as it was.

        :return: new :class:`Page` built from the adjusted rows"""
        stages = self.stages | cli.deref() # works on the dereferenced result, presumably a copy - TODO confirm
        path2Meta = {}
        for stage in stages:
            for elem in stage: path2Meta[elem[7]] = elem
        for stage in stages[1:]: # root has no parent to be clipped against
            for elem in stage:
                parent = path2Meta["/".join(elem[7].split("/")[:-1])]
                self._clipToParent(elem, parent)
        return Page(stages)



[docs]
class Element:
    """Convenient view over one Meta row of a page.

    :param page: the owning page. Needs ``wh``, ``path2CPaths`` and lookup by path via ``page[path]``
    :param data: the Meta row, ``[obj, x, y, w, h, wh, type, path, %area of parent]``"""
    def __init__(self, page, data):
        self.page = page; self.data = data
    @property
    def parent(self):
        """Grab this element's parent element. If not found, then return None"""
        return self.page["/".join(self.data[7].split("/")[:-1])]
    @property
    def obj(self):
        """The underlying element object (slot 0)"""
        return self.data[0]
    @property
    def x(self): return self.data[1]
    @property
    def y(self): return self.data[2]
    @property
    def w(self): return self.data[3]
    @property
    def h(self): return self.data[4]
    @property
    def wh(self):
        """Area of this element (slot 5)"""
        return self.data[5]
    @property
    def path(self): return self.data[7]
    @property
    def children(self) -> "list[Element]":
        """Direct children of this element, looked up through the page"""
        return [self.page[cp] for cp in self.page.path2CPaths[self.data[7]]]
    @property
    def pArea(self):
        """Percentage area of this element vs the entire page. A page with zero
        area (some sites report a 0px tall body) is treated as having area 1e-3
        instead of raising ZeroDivisionError, so the result is very large in that case"""
        return self.wh/(self.page.wh or 1e-3)
    def __repr__(self):
        path = f"'{self.data[7]}'".ljust(30)
        x = f"{self.x}".ljust(4); y = f"{self.y}".ljust(4)
        w = f"{self.w}".ljust(4); h = f"{self.h}".ljust(4)
        type_ = f"'{self.data[6]}'".ljust(9)
        area = f"{round(self.data[8]*100)}%".rjust(4)
        area2 = f"{round(self.pArea*100)}%".rjust(4)
        nChildren = f"{len(self.children)}".ljust(4)
        return f"<Element path={path} loc=({x}, {y}) size=({w}, {h}) type={type_} %area(parent)='{area}' %area(body)='{area2}' #children={nChildren}>"