Source code for k1lib.selen

# AUTOGENERATED FILE! PLEASE DON'T EDIT HERE. EDIT THE SOURCE NOTEBOOKS INSTEAD
"""Nice website analyzer based on Selenium. Whenever I used Selenium
in the past, I've always have to find how to reach specific buttons,
what paragraphs to capture, things like that, all manually. That
really limits my usage of it. Recently, I've made the module ``kapi``,
capable of doing a bunch of DL stuff and feels like this is the right
time to make something on top of Selenium that is able to pull out the
main article, body, header, right/left bar automatically. After that, I
can grab the main text to put it into an embedding db to use it further
downstream. Example::

    from k1lib.imports import *

    browser = selen.getBrowser() # fires up the browser
    browser.get("https://en.wikipedia.org/wiki/Cheese")

    page = selen.Page.analyze(browser) # analyze everything. For a typical page, should take 8-14s total
    page.draw()             # run this inside a notebook to view all stages
    page.mainContents()     # return List[Element] of potential big pieces of the page, like header, footer, left/right bar, etc.

    elem = page.mainContent # grabs what it thinks is most likely the meat of the page, ignoring everything else
    elem.obj                # accesses the internal Selenium Element
    elem.obj.text           # grabs element's text content
    elem.x                  # grabs element's x location. Works with .y too
    elem.w                  # grabs element's width. Works with .h too
"""
import k1lib as k1, numpy as np; import k1lib.cli as cli; import k1lib.p5 as p5
from typing import List
by = k1.dep("selenium.webdriver.common.by")
__all__ = ["getBrowser", "Page", "Element"]
# Placeholder types: they exist only so signatures in this module can name them.
class Meta:
    """Stand-in for the list-shaped record describing one element.

    Slots: obj:BaseElement, x, y, w, h, wh (5), type, path, %area of parent (8).
    """


class Browser:
    """Stand-in for the browser/driver object handed around in this module."""


# One inner list of records per analysis stage.
Stages = List[List[Meta]]
def getBrowser() -> Browser:
    """Launches new browser and return object to manage

    Resolves ``selenium.webdriver`` and ``webdriver_manager.chrome`` through
    ``k1.dep`` and passes whatever ``ChromeDriverManager().install()`` returns
    to the ``Chrome`` constructor.
    """
    seleniumDriver = k1.dep("selenium.webdriver", "selenium==4.8.3")
    makeChrome = seleniumDriver.Chrome
    managerModule = k1.dep("webdriver_manager.chrome", "webdriver_manager==4.0.1")
    driverLocation = managerModule.ChromeDriverManager().install()
    return makeChrome(driverLocation)
def getMeta(e: "Element") -> "Meta":
    """Build the 9-slot record for one Selenium element.

    Slots are ``[obj, x, y, w, h, w*h, type, path, %area of parent]``. The type
    starts as None, the path as ``"r"`` and the area fraction as 0; the later
    slots are filled in by :func:`getChildren`.

    :param e: object exposing ``location`` (with "x"/"y") and ``size`` (with "width"/"height")
    """
    loc = e.location
    size = e.size
    w, h = size["width"], size["height"]
    return [e, loc["x"], loc["y"], w, h, w * h, None, "r", 0]


def classifyChildren(parent: "Meta", children: "List[Meta]") -> None:
    """Tag ``parent`` with its decomposition type and each child with its share of the parent.

    Types written into ``parent[6]``:

    - partition: clear decomposition into multiple large components
    - delegation: parent has 1 giant component, and many other small components

    A third type ("weird": children overlapping or bigger than the parent) is
    described by the original design but is not enabled.

    Each ``child[8]`` receives that child's own area divided by the parent's area.
    Both arguments are modified in place. Does nothing if ``children`` is empty.
    """
    if not children:
        return
    parentArea = parent[3] * parent[4] or 1e-3  # floor avoids dividing by zero for collapsed parents
    fractions = [child[5] / parentArea for child in children]  # same order as `children`
    ranked = np.sort(np.array(fractions, dtype=float))[::-1]   # biggest first, for classification only
    total = ranked.sum()
    # The relative share (>0.85) says the biggest child dominates its siblings. The
    # absolute share (>0.7) says it also dominates the parent; if it doesn't, then
    # it's still a partition.
    dominant = bool(total > 0 and ranked[0] / total > 0.85 and ranked[0] > 0.7)
    parent[6] = "delegation" if dominant else "partition"
    # Fractions must be written in child order: pairing the ranked array with the
    # children would hand each child some other child's area.
    for child, fraction in zip(children, fractions):
        child[8] = fraction


def getChildren(parent: "Meta", autoInc):
    """Tries to discover children of this element that are somewhat reasonable.

    Direct children with area of 2000 or less are dropped. While exactly one
    child survives, the search descends into that child, because a lone child
    is not a useful decomposition. Surviving children get the path
    ``"<original parent path>/<autoInc()>"`` and the original parent is
    classified via :func:`classifyChildren`.

    :param parent: record of the element to decompose; its slot 6 is updated in place
    :param autoInc: zero-argument callable giving the next path component
    :return: list of child records, or ``[]`` if none are found or a lookup fails
    """
    ogParent = parent
    while True:
        try:
            children = parent[0].find_elements(by.By.XPATH, "*") | cli.apply(getMeta) | cli.deref()
        except Exception:
            return []  # elements can go stale!
        children = children | cli.filt("x>2000", 5) | cli.deref()
        if not children:
            return []  # short circuits
        if len(children) > 1:
            break
        parent = children[0]  # lone child, so loop around more
    children = children | cli.apply(lambda _: f"{ogParent[7]}/{autoInc()}", 7) | cli.deref()  # injects path into children
    classifyChildren(ogParent, children)
    return children


def getStages(browser: "Browser", maxDepth: int = 30):
    """Decompose the page into stages, starting from ``<body>``.

    Stage 0 holds only the body record. Each following stage holds the children
    (per :func:`getChildren`) of every record in the previous stage.

    :param browser: object whose ``find_element`` locates the body element
    :param maxDepth: upper bound on the number of expansion rounds
    :return: list of stages, each a list of records
    """
    body = getMeta(browser.find_element(by.By.CSS_SELECTOR, "body"))
    current = [body]
    stages = [current]
    autoInc = k1.AutoIncrement()
    for _ in range(maxDepth):
        current = current | cli.apply(getChildren, autoInc=autoInc) | cli.joinSt() | cli.deref()
        if not current:
            break
        stages.append(current)
    return stages


def draw(elem: "Meta"):
    """Draw one record on the current p5 sketch: its box plus up to 3 lines of label text.

    The outline is red for "partition" records and black otherwise. This is
    best effort: any error while reading or drawing the element is ignored.
    """
    try:
        text = elem[0].text
        p5.stroke(*([255, 0, 0] if elem[6] == "partition" else [0, 0, 0]))
        p5.noFill()
        p5.rect(*elem[1:5])
        p5.noStroke()
        p5.fill(0)
        label = (f"{elem[7]} - {elem[6]}\n" + text).split("\n") | cli.head(3) | cli.join("\n")
        p5.text(label, elem[1], elem[2] + 10)
    except Exception:
        pass  # deliberately skipped so one bad element doesn't abort the whole stage


def drawStages(stages: "Stages"):
    """Render every stage to its own sketch and return the list of ``p5.svg()`` results.

    The canvas size comes from the body record (``stages[0][0]``), capped at 3000 per side.
    """
    body = stages[0][0]
    imgs = []
    w = body[3]
    h = body[4]
    if w < 10 or h < 10:  # some sites (looking at you, YouTube), have 0px body height, which screws the rendering up
        # Derive the size from the right/bottom edges of all records instead. Some sites
        # have elements way beyond bounds, so edges of 3000 or more are filtered out.
        w, h = stages | cli.joinSt() | cli.cut(1, 2, 3, 4) | ~cli.apply(lambda x,y,w,h: [x+w,y+h]) | cli.filt("x<3000", [0,1]) | cli.transpose() | cli.apply(~cli.sort(None) | ~cli.head(0.05) | cli.item() | cli.op()+20)
    w = min(w, 3000)  # again, making sure it doesn't draw something completely insane
    h = min(h, 3000)
    for elems in stages:
        p5.newSketch(w, h, False)
        p5.background(255)
        elems | cli.apply(draw) | cli.deref()
        imgs.append(p5.svg())
    return imgs
class Page:
    """Result of analyzing a page: the staged element records plus lookup tables over them."""

    def __init__(self, stages: "Stages"):
        """Index ``stages`` by full path, by last path component, and by parent path.

        :param stages: list of stages; ``stages[0][0]`` is taken as the body record
        """
        self.stages = stages
        self._imgs = None  # cached images
        self.path2Elem = stages | cli.joinSt() | cli.apply(lambda x: [x[7], x]) | cli.toDict()
        self.idx2Elem = stages | cli.joinSt() | cli.apply(lambda x: [x[7].split("/")[-1], x]) | cli.toDict()
        self.path2CPaths = stages | cli.joinSt() | cli.cut(7) | cli.apply(lambda x: ["/".join(x.split("/")[:-1]), x]) | cli.groupBy(0, True) | cli.apply(cli.item().all(), 1) | cli.deref() | cli.toDict(f=lambda:[])  # path to children paths
        self.w, self.h = self.stages[0][0][3:5]
        self.wh = self.w * self.h

    @staticmethod
    def analyze(browser, bounded=True):
        """Analyze whatever is on the browser at this moment.

        :param bounded: if True, adjusts all internal bounding boxes so that
            they don't overflow. If False, then children can be bigger than its parent
        """
        page = Page(getStages(browser))
        return page.bound() if bounded else page

    def draw(self):
        """Quickly views each analysis stages, see bounding boxes, their paths and whatnot"""
        if not self._imgs:
            self._imgs = drawStages(self.stages)
        return self._imgs | k1.viz.Carousel()

    def __getitem__(self, s) -> "Element | None":
        """Look an element up by full path first, then by last path component.

        The key is converted to a string first. Returns None when neither table has it.
        """
        s = f"{s}"
        if s in self.path2Elem:
            return Element(self, self.path2Elem[s])
        if s in self.idx2Elem:
            return Element(self, self.idx2Elem[s])
        return None

    def mainContents(self):
        """Returns some candidate Elements that seems to be the main content.
        You'd have to write some minimum code to determine what you'd like to use,
        but the bulk of the work by this point is done.

        Candidates are the records typed "partition", sorted by area (largest
        first), limited to the first 10.
        """
        ranked = self.stages | cli.joinSt() | cli.filt(cli.op()=="partition", 6) | ~cli.sort(5) | cli.cut(7, 5) | cli.head(10) | cli.deref()
        return ranked | cli.cut(0) | cli.lookup(self) | cli.deref()

    @property
    def mainContent(self):
        """Really tries to extract out the main content, so that this can be automated.
        It might not be good, but at least it's automated.

        Prefers the first candidate covering less than 70% of the page; if that
        selection fails for any reason, falls back to the first candidate.
        """
        candidates = self.mainContents()
        try:
            return candidates | cli.filt("x.pArea < 0.7") | cli.item()
        except Exception:
            return candidates | cli.item()

    @staticmethod
    def _clipToParents(stages):
        """Clip every record from stage 1 onward to its parent's box, in place.

        The parent is found by dropping the last component of the record's path.
        Width and height never go below 0: a child lying wholly outside its
        parent would otherwise get two negative sizes whose product is a
        positive area.

        :return: the same ``stages`` object
        """
        byPath = {elem[7]: elem for stage in stages for elem in stage}
        for stage in stages[1:]:
            for elem in stage:
                parent = byPath["/".join(elem[7].split("/")[:-1])]
                ex1 = elem[1]; ex2 = elem[1] + elem[3]
                ey1 = elem[2]; ey2 = elem[2] + elem[4]
                px1 = parent[1]; px2 = parent[1] + parent[3]
                py1 = parent[2]; py2 = parent[2] + parent[4]
                elem[1] = max(ex1, px1); elem[3] = max(min(ex2, px2) - elem[1], 0)
                elem[2] = max(ey1, py1); elem[4] = max(min(ey2, py2) - elem[2], 0)
                elem[5] = elem[3] * elem[4]
        return stages

    def bound(self) -> "Page":
        """Some children is detached from the parent outside of it, and can grow
        bigger. But that messes up with the ranking technique that I have right
        now. So purpose of this is to limit the children to the parent's size.

        Works on a ``cli.deref()`` copy of the stages and returns a new Page built from it.
        """
        stages = self.stages | cli.deref()
        return Page(Page._clipToParents(stages))
class Element:
    """Read-only view over one record of a :class:`Page`.

    ``data`` is the list-shaped record
    ``[obj, x, y, w, h, wh, type, path, %area of parent]``.
    """

    def __init__(self, page, data):
        """:param page: owning page, used for path lookups and for the page area
        :param data: the record this element wraps"""
        self.page = page
        self.data = data

    @property
    def parent(self):
        """Grab this element's parent element. If not found, then return None"""
        return self.page["/".join(self.data[7].split("/")[:-1])]

    @property
    def obj(self):
        """The wrapped object stored in slot 0 of the record."""
        return self.data[0]

    @property
    def x(self):
        return self.data[1]

    @property
    def y(self):
        return self.data[2]

    @property
    def w(self):
        return self.data[3]

    @property
    def h(self):
        return self.data[4]

    @property
    def wh(self):
        """Area (slot 5 of the record)."""
        return self.data[5]

    @property
    def path(self):
        return self.data[7]

    @property
    def children(self) -> "list[Element]":
        """Elements for every child path the page lists under this element's path."""
        return [self.page[cp] for cp in self.page.path2CPaths[self.data[7]]]

    @property
    def pArea(self):
        """Percentage area of this element vs the entire page"""
        # A page can have zero area (e.g. a 0px-high body); floor the divisor
        # so that this and __repr__ don't raise ZeroDivisionError.
        return self.wh / (self.page.wh or 1e-3)

    def __repr__(self):
        path = f"'{self.data[7]}'".ljust(30)
        x = f"{self.x}".ljust(4); y = f"{self.y}".ljust(4)
        w = f"{self.w}".ljust(4); h = f"{self.h}".ljust(4)
        type_ = f"'{self.data[6]}'".ljust(9)
        area = f"{round(self.data[8]*100)}%".rjust(4)
        area2 = f"{round(self.pArea*100)}%".rjust(4)
        nChildren = f"{len(self.children)}".ljust(4)
        return f"<Element path={path} loc=({x}, {y}) size=({w}, {h}) type={type_} %area(parent)='{area}' %area(body)='{area2}' #children={nChildren}>"