# AUTOGENERATED FILE! PLEASE DON'T EDIT HERE. EDIT THE SOURCE NOTEBOOKS INSTEAD
# Public API of this module: only the two cli classes defined below are exported.
__all__ = ["grep", "grepTemplate"]
import re, k1lib, json
from k1lib.cli.init import BaseCli; import k1lib.cli.init as init
import k1lib.cli.kjs as kjs; import k1lib.cli as cli
from collections import deque; from typing import Iterator, Union, Callable, Any
def extGuard(cond, s=""):
    """Reject an argument combination that is invalid in grep's extract mode.

    :param cond: anything truthy means the offending combination was detected
    :param s: human-readable explanation appended to the error message
    :raises Exception: if ``cond`` is truthy"""
    # The exception used to be constructed but never raised, which turned
    # every guard into a silent no-op.
    if cond:
        raise Exception(f"Can't use extract mode. {s}")
def jsfGuard(cond, s=""):
    """Reject a grep parameter that the JavaScript transpiler cannot express.

    :param cond: anything truthy means the unsupported parameter was customized
    :param s: name of the unsupported parameter, inserted into the error message
    :raises Exception: if ``cond`` is truthy"""
    # The exception used to be constructed but never raised, so unsupported
    # parameters were silently ignored instead of being reported.
    if cond:
        raise Exception(f"grep._jsF() does not support custom {s} parameter")
# Positive infinity. Compared against grep's `N`, and assigned to `after` by
# grep.till() to mean "keep emitting until the till pattern shows up".
inf = float("inf")
class grep(BaseCli):
    def __init__(self, pattern:Union[str, Callable[[Any], bool]], before:int=0, after:int=0, N:int=float("inf"), sep:bool=False, col:int=None, extract:str=None):
        """Find lines that have the specified pattern.
        Example::

            # returns ['d', 'd']
            "abcde12d34" | grep("d") | deref()
            # returns ['c', 'd', '2', 'd'], 2 sections of ['c', 'd'] and ['2', 'd']
            "abcde12d34" | grep("d", 1) | deref()
            # returns ['c', 'd']
            "abcde12d34" | grep("d", 1, N=1) | deref()
            # returns ['d', 'e', 'd', '3', '4'], 2 sections of ['d', 'e'] and ['d', '3', '4']
            "abcde12d34" | grep("d", 0, 3).till("e") | deref()
            # returns [['0', '1', '2'], ['3', '1', '4']]
            "0123145" | grep("1", 2, 1, sep=True) | deref()

        You can also separate out the sections::

            # returns [['c', 'd'], ['2', 'd']]
            "abcde12d34" | grep("d", 1, sep=True) | deref()
            # returns [['c', 'd']]
            "abcde12d34" | grep("d", 1, N=1, sep=True) | deref()
            # returns [['1', '2', '3'], ['1', '4', '5']]
            "0123145" | grep("1", sep=True).till() | deref()

        You can also put in predicates instead of regex patterns::

            # returns ['d', 'd']
            "abcde12d34" | grep(lambda x: x == "d") | deref()
            # also returns ['d', 'd']
            "abcde12d34" | filt(lambda x: x == "d") | deref()
            # returns ['d', 'e', 'd', '3', '4']
            "abcde12d34" | grep(lambda x: x == "d").till(lambda x: x == "e") | deref()

        The first scenario looks like a regular filter function, already implemented by :class:`~k1lib.cli.filt.filt`,
        but :class:`grep` brings in more clustering features for the price of reduced
        execution speed. So for simple scenarios it's advised that you use :class:`~k1lib.cli.filt.filt`.

        See also: :class:`~k1lib.cli.structural.groupBy`

        Also, there's a `whole tutorial <../tutorials/cli.html>`_ devoted to just this cli

        Also also, if each element in the input iterator is not a string/bytes, and
        you're searching using regex, then it will get its representation and searches
        in it.

        .. admonition:: Extract mode

            Sometimes, you want to extract a subsection of a matched string, like extracting
            links in a html file, then you can do something like this::

                # returns ['a.io', 'b.com', 'c.net']
                ["href='a.io'", "href='b.com'", "href='c.net'"] | grep("href='(?P<g>.*)'", extract="g") | deref()
                # returns [['a.io', 3], ['b.com', 4], ['c.net', 5]]
                [["href='a.io'", 3], ["href='b.com'", 4], ["href='c.net'", 5]] | grep("href='(?P<g>.*)'", extract="g", col=0) | deref()

            Essentially, you're defining the group with name "g" to be any string within a quote block
            following "href", and then it will just extract out the group that you want. Because the
            purpose of this mode is to extract matched objects, a few of the arguments don't really make
            sense and thus are disabled, like "before", "after", "sep", "N"

        Regex quick cheatsheet:

        - `\\d`: digit (\\D for inverse)
        - `^`: begin of string ($ for end of string)
        - `\\w`: unicode word (\\W for inverse)
        - `(?!...)`: matches if the inside does not match
        - `(?P<name>...)`: matches as group "name"
        - `A|B`: matches A or B
        - `[aml]`: set of characters "a", "m" and "l"
        - `a{3,5}`: matches character "a" 3 to 5 times ("aaa", "aaaa" and "aaaaa")
        - `a*`: matches "a" 0 or more times (`a*?` matches "a" 0 or more times non-greedy)

        :param pattern: regex pattern to search for in a line, or a predicate function
        :param before: lines before the hit. Outputs independent lines
        :param after: lines after the hit. Outputs independent lines
        :param N: max sections to output
        :param sep: whether to separate out the sections as lists
        :param col: searches for pattern in a specific column
        :param extract: name (or index) of the regex group to pull out of every match.
            Turns on extract mode"""
        super().__init__()
        self.pattern = pattern
        if isinstance(pattern, str):
            self._f = re.compile(pattern).search  # bound method kept around for quick access
            self.mode = 0  # regex mode: non-string elements get stringified before searching
        else:
            self._f = cli.op.solidify(pattern)
            self.mode = 1  # predicate mode: elements are handed to the function untouched
        self.before = before; self.after = after; self.col = col
        self.N = N; self.sep = sep; self.inverted = False
        self.tillPattern = None; self.tillAfter = None
        self._tillF = lambda x: False  # no closing pattern until .till() is called
        self.extract = extract
        if extract:
            extGuard(before, "`before` has to be zero")
            extGuard(after, "`after` has to be zero")
            extGuard(sep, "`sep` has to be False")
            # `col` is deliberately not guarded: __ror__ has a dedicated
            # extract-with-column branch and the docstring above documents it.
            extGuard(N < inf, "`N` has to be infinite. Just use head() if you want to limit the number of results")

    def till(self, pattern:Union[str, Callable[[Any], bool]]=None):
        """Greps until some other pattern appear. Inclusive, so you might want to
        trim the last line. Example::

            # returns ['5', '6', '7', '8'], includes last item
            range(10) | join("") | grep("5").till("8") | deref()
            # returns ['d', 'e', 'd', '3', '4']
            "abcde12d34" | grep("d").till("e") | deref()
            # returns ['d', 'e']
            "abcde12d34" | grep("d", N=1).till("e") | deref()

        If initial pattern and till pattern are the same, then you don't have to use this method at
        all. Instead, do something like this::

            # returns ['1', '2', '3']
            "0123145" | grep("1", after=1e9, N=1) | deref()

        :param pattern: regex pattern or predicate that closes a section. If None,
            reuses the opening pattern
        :return: this same object, so the call can be chained"""
        if self.extract:
            extGuard(True, "Can't use .till() in extract mode as it makes no sense")
        if pattern is None:
            self._tillF = self._f
        elif isinstance(pattern, str):
            self._tillF = re.compile(pattern).search
        else:
            self._tillF = cli.op.solidify(pattern)
        # The original `after` now applies past the closing line, while the
        # section itself stays open (infinite `after`) until that line is found.
        self.tillAfter = self.after
        self.after = inf
        return self

    def __ror__(self, it:Iterator[str]) -> Iterator[str]:
        """Runs the search over ``it``.

        Yields matched lines (plus their before/after context), lists of lines
        when ``sep`` is set, or extracted groups/rows in extract mode."""
        self.sectionIdx = 0; col = self.col; _f = self._f; _tillF = self._tillF
        if self.sep:
            # Run a non-separating clone and cut its flat output every time the
            # clone's section counter moves forward.
            elems = []; idx = 0
            s = self._clone(); s.sep = False
            for line in (it | s):
                if s.sectionIdx > idx:  # a new section just started, flush the previous one
                    if len(elems) > 0: yield list(elems)
                    idx = s.sectionIdx; elems = []
                elems.append(line)
            # Only flush a real trailing section. Without this check, an input
            # with no hits at all produced a single bogus empty section.
            if elems: yield elems
            return
        if self.extract:
            group = self.extract
            if col is None:
                for line in it:
                    res = _f(line)
                    if res: yield res.group(group)
            else:
                for line in it:
                    line = list(line); res = _f(line[col])
                    if res:
                        line[col] = res.group(group)  # swap the searched cell for the extracted group
                        yield line
            return
        queue = deque([], self.before)  # most recent lines outside any section, at most `before` of them
        counter = 0  # remaining lines after to display
        # NOTE(review): relies on k1lib.RunOnce.done() returning False on the first call
        # after revert() and True afterwards, so that the hit line (already yielded
        # below) is not yielded a second time by the "after" branch - confirm against RunOnce
        cRO = k1lib.RunOnce(); cRO.done()
        for line in it:
            if col is not None:
                line = list(line); elem = line[col]
            else:
                elem = line
            if self.mode == 0 and not isinstance(elem, (str, bytes)):
                elem = f"{elem}"  # regex search needs text, so fall back to the string form
            if _f(elem):  # new section
                self.sectionIdx += 1; counter = self.after + 1; cRO.revert()
                if self.sectionIdx > self.N: return
                yield from queue; queue.clear(); yield line
            elif _tillF(elem) and counter == inf:  # closing section
                counter = self.tillAfter + 1; cRO.revert(); yield line
            if counter == 0:
                queue.append(line)  # saves recent past lines
            elif counter > 0:  # yielding "after" section
                if cRO.done(): yield line
                counter -= 1

    def __invert__(self):
        """Flips the pattern, just like how :class:`~k1lib.cli.filt.filt`
        works. Example::

            # returns ['a', 'b', 'c', 'e', '1', '2', '3', '4']
            "abcde12d34" | ~grep("d") | deref()

        :return: a new :class:`grep` whose match condition is negated"""
        if self.extract:
            extGuard(True, "Can't invert search condition in extract mode as it makes no sense")
        if self.inverted:
            raise Exception("Can't invert grep() more than twice, it makes no sense")
        ans = self._clone()
        f = self._f
        ans._f = lambda s: not f(s)
        ans.preInvGrep = self  # _jsF() transpiles the original grep and passes an `inv` flag
        ans.inverted = True
        return ans

    def _clone(self):
        """Creates a new grep with the same search configuration as this one."""
        answer = grep(self.pattern, self.before, self.after, self.N, self.sep, self.col)
        answer._tillF = self._tillF; answer.tillAfter = self.tillAfter
        # Carry over the live matcher and the inversion state. Rebuilding the
        # matcher from `self.pattern` alone silently dropped the inversion, so
        # `~grep(p, sep=True)` ended up searching for `p` again.
        answer._f = self._f
        answer.inverted = self.inverted
        if self.inverted: answer.preInvGrep = self.preInvGrep
        return answer

    def _jsF(self, meta):
        """Transpiles this grep into a JavaScript function definition.

        Only string patterns, optionally with `col` and inversion, are supported.

        :return: tuple of (JavaScript source string, name of the generated function)
        :raises Exception: if the pattern is not a string, or if jsfGuard rejects
            a customized parameter"""
        jsfGuard(self.before != 0, "before"); jsfGuard(self.after != 0, "after")
        jsfGuard(self.N < float("inf"), "N"); jsfGuard(self.sep, "sep")
        jsfGuard(self.extract is not None, "extract")
        inverted = self.inverted
        # An inverted grep is transpiled from the grep it was derived from,
        # with the inversion passed along as a flag instead.
        src = self.preInvGrep if inverted else self
        if not isinstance(src.pattern, str):
            raise Exception("grep._jsF() does not support pattern that's not a string")
        # `argIdx` is unused, but the call is kept as it may advance an internal
        # counter that other generated names depend on - TODO confirm
        fIdx = init._jsFAuto(); dataIdx = init._jsDAuto(); argIdx = init._jsDAuto()
        p = src.pattern
        # Rewrites every span enclosed by the 2 private-use marker characters into
        # JS template literal syntax, aka "${...}". Presumably these markers are
        # inserted upstream by the transpiler - verify against k1lib.cli.kjs
        spans = (src.pattern | grep("\ue157", sep=True).till("\ue239") | cli.apply(cli.join("")) | cli.filt("x")
                 | cli.apply(lambda x: [x, x.replace("\ue157", "${").replace("\ue239", "}")]))
        for x, y in spans:
            p = p.replace(x, y)
        return f"{fIdx} = ({dataIdx}) => {dataIdx}.grep(`{p}`, {{col: {cli.kjs.v(src.col)}, inv: {json.dumps(inverted)}}})", fIdx
class grepTemplate(BaseCli):
    def __init__(self, pattern:str, template:str):
        """Searches every incoming line for ``pattern`` and, for each line that
        matches, yields ``template`` expanded with that match. Lines that
        don't match are dropped.

        :param pattern: regex pattern, compiled once up front
        :param template: template string passed to the match object's ``expand()``"""
        super().__init__()
        self.pattern = re.compile(pattern)
        self.template = template

    def __ror__(self, it:Iterator[str]):
        """Yields the expanded template for every line of ``it`` that matches."""
        super().__ror__(it)
        for line in it:
            hit = self.pattern.search(line)
            if hit is not None:
                yield hit.expand(self.template)