# AUTOGENERATED FILE! PLEASE DON'T EDIT HERE. EDIT THE SOURCE NOTEBOOKS INSTEAD
"""Nice website analyzer based on Selenium. Whenever I used Selenium
in the past, I've always had to find how to reach specific buttons,
what paragraphs to capture, things like that, all manually. That
really limits my usage of it. Recently, I've made the module ``kapi``,
capable of doing a bunch of DL stuff and feels like this is the right
time to make something on top of Selenium that is able to pull out the
main article, body, header, right/left bar automatically. After that, I
can grab the main text to put it into an embedding db to use it further
downstream. Example::

    from k1lib.imports import *
    browser = selen.getBrowser() # fires up the browser
    browser.get("https://en.wikipedia.org/wiki/Cheese")
    page = selen.Page.analyze(browser) # analyze everything. For a typical page, should take 8-14s total
    page.draw() # run this inside a notebook to view all stages
    page.mainContents() # returns List[Element] of potential big pieces of the page, like header, footer, left/right bar, etc.
    elem = page.mainContent # grabs what it thinks is the most likely meat of the page, ignoring everything else
    elem.obj # accesses the internal Selenium Element
    elem.obj.text # grabs element's text content
    elem.x # grabs element's x location. Works with .y too
    elem.w # grabs element's width. Works with .h too
"""
import k1lib as k1, numpy as np; import k1lib.cli as cli; import k1lib.p5 as p5
from typing import List
# Selenium's locator module (provides by.By.XPATH / by.By.CSS_SELECTOR used below),
# loaded through k1.dep -- presumably k1lib's optional-dependency loader; confirm there.
by = k1.dep("selenium.webdriver.common.by")
# public names of this module
__all__ = ["getBrowser", "Page", "Element"]
# Dummy classes representing type information only; they are never instantiated here.
# A "Meta" is really a 9-slot list describing one DOM element (see getMeta):
#   [0] obj (the Selenium element), [1] x, [2] y, [3] w, [4] h, [5] wh (area),
#   [6] type (None, "partition" or "delegation"), [7] path, [8] %area of parent
class Meta: pass
# stands in for the driver object returned by getBrowser()
class Browser: pass
# one inner list of Meta rows per analysis stage, as produced by getStages()
Stages = List[List[Meta]]
def getBrowser() -> Browser:
    """Launches a new Chrome browser and returns the object used to manage it.

    The chromedriver location comes from ``ChromeDriverManager().install()``
    and is handed to Selenium through a ``Service`` object. Passing the path
    as the first positional argument of ``Chrome()`` is deprecated in
    Selenium 4 and removed in later 4.x releases, whereas ``service=`` works
    on the pinned 4.8.3 as well as on newer versions.

    :return: the Selenium Chrome driver"""
    # version strings are passed to k1.dep exactly as before (presumably install hints)
    webdriver = k1.dep("selenium.webdriver", "selenium==4.8.3")
    service = k1.dep("selenium.webdriver.chrome.service", "selenium==4.8.3")
    manager = k1.dep("webdriver_manager.chrome", "webdriver_manager==4.0.1")
    driverPath = manager.ChromeDriverManager().install()
    return webdriver.Chrome(service=service.Service(driverPath))
def getMeta(e:"Element") -> Meta:
    """Builds the 9-slot metadata row for a single browser element.

    :param e: object exposing ``.location`` (mapping with "x" and "y") and
        ``.size`` (mapping with "width" and "height")
    :return: ``[e, x, y, w, h, w*h, None, "r", 0]``. The type slot starts as
        None, the path slot as "r" and the %area-of-parent slot as 0"""
    position, dims = e.location, e.size
    x, y = position["x"], position["y"]
    width, height = dims["width"], dims["height"]
    return [e, x, y, width, height, width*height, None, "r", 0]
def getChildren(parent:Meta, autoInc):
    """Discovers the reasonably-sized direct children of ``parent`` and classifies ``parent``.

    Children with area <= 2000 are ignored. If exactly one child survives,
    that child is descended into (it is not a useful split by itself) until
    a node with several surviving children is found, or nothing is left.

    On success the *original* parent's type slot (index 6) is set to one of:

    - "partition": clear decomposition into multiple large components
    - "delegation": parent has 1 giant component, and many other small components

    A third type, "weird" (children overlapping each other or extending
    outside the parent, indicating position absolute/fixed), is temporarily
    not enabled.

    :param parent: Meta row of the element to decompose. Mutated: slot 6
    :param autoInc: zero-argument callable returning a fresh id on each call,
        used for the last segment of each child's path
    :return: list of child Meta rows with path (slot 7) and %area of parent
        (slot 8) filled in, or ``[]`` if nothing useful was found"""
    ogParent = parent
    while True:
        try: children = [getMeta(e) for e in parent[0].find_elements(by.By.XPATH, "*")]
        except Exception: return [] # elements can go stale!
        children = [c for c in children if c[5] > 2000] # drops tiny elements
        if len(children) == 0: return [] # short circuits
        if len(children) > 1: break
        parent = children[0] # lone child, so loop around more, because it's not useful
    for c in children: c[7] = f"{ogParent[7]}/{autoInc()}" # injects path into children
    pw, ph = ogParent[3:5]; pwh = pw*ph or 1e-3 # guards against zero-area parents
    # area of each child as a fraction of the parent, kept in the same order as `children`
    areas = np.array([c[5] for c in children]) / pwh
    biggest = areas.max()
    # biggest/sum is the child's share among its siblings, to judge relative size
    # instead of area fraction alone. The >0.7 part means that if the child dominates
    # other children but doesn't dominate the parent, then it's a partition
    if biggest/areas.sum() > 0.85 and biggest > 0.7: ogParent[6] = "delegation"
    else: ogParent[6] = "partition"
    # each child gets its OWN fraction; zipping against a sorted array would mismatch them
    for c, a in zip(children, areas): c[8] = float(a)
    return children
def getStages(browser:Browser):
    """Walks the page from ``<body>`` downwards and collects elements stage by stage.

    Stage 0 holds only the body's Meta row. Each following stage is the
    concatenation of ``getChildren`` over every element of the previous one.
    Expansion stops when a stage comes back empty, or after 30 rounds.

    :param browser: driver exposing ``find_element``
    :return: list of stages, each a list of Meta rows"""
    autoInc = k1.AutoIncrement() # single id source shared by all getChildren calls
    root = getMeta(browser.find_element(by.By.CSS_SELECTOR, "body"))
    frontier = [root]
    stages = [frontier]
    for _ in range(30):
        frontier = frontier | cli.apply(getChildren, autoInc=autoInc) | cli.joinSt() | cli.deref()
        if not frontier: break
        stages.append(frontier)
    return stages
def draw(elem:Meta):
    """Draws one element's bounding box and a short label onto the current p5 sketch.

    The box outline is red for "partition" elements and black otherwise. The
    label is "<path> - <type>" followed by the element's text, cut to at most
    3 lines in total. Drawing is best-effort: any ordinary error (for example
    from reading ``.text``) silently skips the element, but KeyboardInterrupt
    and SystemExit are no longer swallowed.

    :param elem: Meta row of the element to draw"""
    try:
        text = elem[0].text
        outline = [255, 0, 0] if elem[6] == "partition" else [0, 0, 0]
        p5.stroke(*outline); p5.noFill(); p5.rect(*elem[1:5])
        label = "\n".join((f"{elem[7]} - {elem[6]}\n" + text).split("\n")[:3])
        p5.noStroke(); p5.fill(0); p5.text(label, elem[1], elem[2]+10)
    except Exception: pass # deliberate best-effort, a failing element must not abort the whole stage
def drawStages(stages:Stages):
    """Renders every stage to its own SVG image.

    The canvas size is taken from the body element (``stages[0][0]``). If
    either side is under 10px, a size is estimated from the elements instead.
    The canvas is capped at 3000x3000.

    :param stages: output of ``getStages``
    :return: list with one ``p5.svg()`` result per stage"""
    root = stages[0][0]
    w, h = root[3], root[4]
    if w < 10 or h < 10: # some sites (looking at you, YouTube), have 0px body height, which screws the rendering up
        # per axis: far edges (x+w, y+h) of all elements, keeping those under 3000 because some
        # sites have elements way beyond bounds, then a value near the top of the sorted
        # edges plus 20px of margin (exact semantics of ~head(0.05) to be confirmed in k1lib)
        w,h = stages | cli.joinSt() | cli.cut(1, 2, 3, 4) | ~cli.apply(lambda x,y,w,h: [x+w,y+h]) | cli.filt("x<3000", [0,1]) | cli.transpose() | cli.apply(~cli.sort(None) | ~cli.head(0.05) | cli.item() | cli.op()+20)
    # again, making sure it doesn't draw something completely insane
    # NOTE(review): original indentation was lost; assumed this clamp applies on both paths -- confirm
    w = min(w, 3000); h = min(h, 3000)
    imgs = []
    for elems in stages:
        p5.newSketch(w, h, False); p5.background(255)
        for elem in elems: draw(elem)
        imgs.append(p5.svg())
    return imgs
class Page:
    """Result of analyzing a web page: all discovered elements, grouped by stage.

    Holds the raw stages (lists of Meta rows) plus lookup tables keyed by
    element path, so that elements can be fetched with ``page[path]`` or
    ``page[idx]`` and wrapped into :class:`Element` objects."""
    def __init__(self, stages:Stages):
        """:param stages: list of stages, each a list of Meta rows. ``stages[0][0]`` is the body"""
        self.stages = stages; self._imgs = None # cached images
        # full path, like "r/3/17" -> Meta row
        self.path2Elem = stages | cli.joinSt() | cli.apply(lambda x: [x[7], x]) | cli.toDict()
        # last path segment, like "17" -> Meta row
        self.idx2Elem = stages | cli.joinSt() | cli.apply(lambda x: [x[7].split("/")[-1], x]) | cli.toDict()
        self.path2CPaths = stages | cli.joinSt() | cli.cut(7) | cli.apply(lambda x: ["/".join(x.split("/")[:-1]), x]) | cli.groupBy(0, True) | cli.apply(cli.item().all(), 1) | cli.deref() | cli.toDict(f=lambda:[]) # path to children paths
        self.w, self.h = self.stages[0][0][3:5]; self.wh = self.w*self.h
    @staticmethod
    def analyze(browser, bounded=True):
        """Analyze whatever is on the browser at this moment.

        :param browser: driver to read the page from
        :param bounded: if True, adjusts all internal bounding boxes so that they don't overflow. If False, then children can be bigger than its parent
        :return: the analyzed :class:`Page`"""
        page = Page(getStages(browser))
        return page.bound() if bounded else page
    def draw(self):
        """Quickly views each analysis stages, see bounding boxes, their paths and whatnot.
        Rendered images are cached after the first call"""
        if not self._imgs: self._imgs = drawStages(self.stages)
        return self._imgs | k1.viz.Carousel()
    def __getitem__(self, s) -> "Element | None":
        """Looks up an element by full path first, then by last path segment.

        :param s: path or index. Converted to a string before lookup
        :return: the matching :class:`Element`, or None if not found"""
        s = f"{s}"
        if s in self.path2Elem: return Element(self, self.path2Elem[s])
        if s in self.idx2Elem: return Element(self, self.idx2Elem[s])
        return None
    def mainContents(self):
        """Returns some candidate Elements that seems to be the main content. You'd have
        to write some minimum code to determine what you'd like to use, but the bulk
        of the work by this point is done.

        :return: up to 10 elements of type "partition", largest area first"""
        # Only the first ranking step (by area) is active. The planned follow-up steps
        # (by #children, then by area again) were disabled upstream and left unreachable
        # code behind that referenced undefined names, so they are dropped here
        a = self.stages | cli.joinSt() | cli.filt(cli.op()=="partition", 6) | ~cli.sort(5) | cli.cut(7, 5) | cli.head(10) | cli.deref()
        return a | cli.cut(0) | cli.lookup(self) | cli.deref()
    @property
    def mainContent(self):
        """Really tries to extract out the main content, so that this can be
        automated. It might not be good, but at least it's automated.

        Picks the first candidate from :meth:`mainContents` covering less than
        70% of the page. If that fails for any reason, falls back to the first
        candidate overall"""
        candidates = self.mainContents() # computed once, reused by the fallback
        try: return candidates | cli.filt("x.pArea < 0.7") | cli.item()
        except Exception: return candidates | cli.item()
    def bound(self) -> "Page":
        """Some children is detached from the parent outside of it, and can grow bigger. But
        that messes up with the ranking technique that I have right now. So purpose of this
        is to limit the children to the parent's size.

        Each element's box is intersected with its (already bounded) parent's
        box. Elements that end up with no overlap get zero width/height
        instead of negative ones, so that two negative sides can't multiply
        into a bogus positive area.

        :return: a new :class:`Page` built from the adjusted stages"""
        stages = self.stages | cli.deref() # presumably a fresh nested-list copy, so this page's own boxes stay as they are -- confirm deref() semantics
        path2Meta = {elem[7]: elem for stage in stages for elem in stage}
        for stage in stages[1:]: # body has no parent. Going stage by stage handles parents before their children
            for elem in stage:
                parent = path2Meta[elem[7].rsplit("/", 1)[0]]
                x1 = max(elem[1], parent[1]); x2 = min(elem[1] + elem[3], parent[1] + parent[3])
                y1 = max(elem[2], parent[2]); y2 = min(elem[2] + elem[4], parent[2] + parent[4])
                elem[1] = x1; elem[3] = max(0, x2 - x1)
                elem[2] = y1; elem[4] = max(0, y2 - y1)
                elem[5] = elem[3]*elem[4]
        return Page(stages)
class Element:
    """Thin read-only view over one Meta row, tied to the page it belongs to.

    ``data`` is the 9-slot row ``[obj, x, y, w, h, wh, type, path, %area of
    parent]``. ``page`` is used to resolve parent/children by path and to
    get the page's total area."""
    def __init__(self, page, data):
        """:param page: owning page. Must support ``page[path]``, ``page.path2CPaths`` and ``page.wh``
        :param data: the Meta row of this element"""
        self.page = page; self.data = data
    @property
    def parent(self):
        """Grab this element's parent element. If not found, then return None"""
        return self.page["/".join(self.data[7].split("/")[:-1])]
    @property
    def obj(self): return self.data[0] # the underlying browser element
    @property
    def x(self): return self.data[1]
    @property
    def y(self): return self.data[2]
    @property
    def w(self): return self.data[3]
    @property
    def h(self): return self.data[4]
    @property
    def wh(self): return self.data[5] # area
    @property
    def path(self): return self.data[7]
    @property
    def children(self) -> "list[Element]":
        """Direct children of this element, looked up through the page's path table"""
        return [self.page[cp] for cp in self.page.path2CPaths[self.data[7]]]
    @property
    def pArea(self):
        """Area of this element as a fraction of the entire page's area"""
        # some pages report a zero-area body. Same 1e-3 guard as the parent-area
        # computation in getChildren, so this (and __repr__) can't divide by zero
        return self.wh/(self.page.wh or 1e-3)
    def __repr__(self):
        path = f"'{self.data[7]}'".ljust(30)
        x = f"{self.x}".ljust(4); y = f"{self.y}".ljust(4)
        w = f"{self.w}".ljust(4); h = f"{self.h}".ljust(4)
        type_ = f"'{self.data[6]}'".ljust(9)
        area = f"{round(self.data[8]*100)}%".rjust(4)
        area2 = f"{round(self.pArea*100)}%".rjust(4)
        nChildren = f"{len(self.children)}".ljust(4)
        return f"<Element path={path} loc=({x}, {y}) size=({w}, {h}) type={type_} %area(parent)='{area}' %area(body)='{area2}' #children={nChildren}>"