Unexpected Success Stories
Dimiter Naydenov
@dimitern
from Bulgaria.Sofia import Dimiter.Naydenov
Python
, Emacs
, Go
, Ubuntu
, Diving, Sci-Fi
import pandas as pd
Goal: Send personalized mail, labeled in sender's handwriting.
Excerpt of a user's SVG sample page.
Generated SVG mail label for another user.
Problem: Extracting pen strokes from SVG XML
Solution: I found svgpathtools which provides:
Path
(base), Line
, CubicBezier
, QuadraticBezier
import svgpathtools as spt
def parse_svg(filename):
    """Read an SVG file and return its paths together with their attributes.

    Delegates to ``svgpathtools.svg2paths``. The result is a pair
    ``(paths, attrs)``: ``paths`` is a list of ``Path`` instances and
    ``attrs`` is a list of dicts holding the XML attributes.
    """
    path_list, attribute_dicts = spt.svg2paths(filename)
    return path_list, attribute_dicts
import pandas as pd
def gen_records(svg_paths):
    """Yield one record dict per path, in input order.

    Each path's ``bbox()`` is unpacked as ``(xmin, xmax, ymin, ymax)``.
    The yielded dict carries the path's position in ``svg_paths`` as
    ``org_idx``, the four bounding-box values, and the path object itself
    under ``path``.
    """
    position = 0
    for stroke in svg_paths:
        x_lo, x_hi, y_lo, y_hi = stroke.bbox()
        yield {
            "org_idx": position,
            "xmin": x_lo,
            "ymin": y_lo,
            "xmax": x_hi,
            "ymax": y_hi,
            "path": stroke,
        }
        position += 1
def load_paths(filename):
    """Load the paths of an SVG file into a pandas DataFrame.

    The file is parsed with ``parse_svg`` (its attribute list is discarded)
    and the records produced by ``gen_records`` are passed to
    ``pd.DataFrame.from_records``.
    """
    strokes, _attrs = parse_svg(filename)
    records = gen_records(strokes)
    return pd.DataFrame.from_records(records)
org_idx | xmin | ymin | xmax | ymax | path |
---|---|---|---|---|---|
0 | x0 | y0 | X0 | Y0 | p0 |
… | … | … | … | … | … |
n-1 | xn-1 | yn-1 | Xn-1 | Yn-1 | pn-1 |
Problem: Compare each stroke with all nearby strokes and merge as letters
Solution: DataFrame iteration and filtering (over multiple passes)
def merge_letters(df, merged, unmerged):
    """Run every stroke-merging pass over ``df`` in a fixed order.

    Args:
        df: DataFrame of paths; must have an ``org_idx`` column.
        merged: Ignored on input -- rebound to a fresh empty set below.
        unmerged: Ignored on input -- rebound to the set of all ``org_idx``
            values found in ``df``.

    Returns:
        A ``(df, merged, unmerged)`` tuple: the frame returned by the last
        pass plus the two bookkeeping sets that were handed to every pass.
    """
    # Both names are rebound here, so caller-supplied values are not
    # consulted; every run starts from "nothing merged yet".
    merged = set()
    # Column selection: ``df.loc['org_idx']`` would look up a *row* labelled
    # 'org_idx' and raise KeyError on a default integer index.
    unmerged = set(df['org_idx'].tolist())

    # Each pass receives the frame returned by the previous one.
    df = merge_dots(df, merged, unmerged)
    df = merge_overlapping(df, merged, unmerged)
    df = merge_crossing_below(df, merged, unmerged)
    df = merge_crossing_above(df, merged, unmerged)
    df = merge_crossing_before(df, merged, unmerged)
    df = merge_crossing_after(df, merged, unmerged)
    return df, merged, unmerged
def merge_overlapping(df, merged, unmerged):
    """Merge each path with the paths whose bounding box strictly encloses it.

    For every row of ``df`` (iterated from the frame as it was on entry),
    the rows of the current frame whose bbox lies strictly outside the
    row's bbox on all four sides are collected, and their ``org_idx``
    values are handed to ``merge_candidates`` together with the row's index.

    Args:
        df: DataFrame with ``xmin``, ``xmax``, ``ymin``, ``ymax`` and
            ``org_idx`` columns.
        merged: Bookkeeping set forwarded unchanged to ``merge_candidates``.
        unmerged: Bookkeeping set forwarded unchanged to ``merge_candidates``.

    Returns:
        The result of ``update_data_frame`` applied to the final frame.
    """
    for path in df.itertuples():
        # Strict inequalities on all four sides: a bbox never encloses an
        # identical bbox, so a row does not select itself.
        encloses = (
            (df.xmin < path.xmin)
            & (df.xmax > path.xmax)
            & (df.ymin < path.ymin)
            & (df.ymax > path.ymax)
        )
        candidates = df[encloses]
        # merge_candidates is defined elsewhere; its return value replaces
        # the working frame, so later iterations filter the updated frame.
        df = merge_candidates(
            df, path.Index, candidates.org_idx.values, merged, unmerged)
    return update_data_frame(df)
def update_data_frame(df):
    """Return a copy of ``df`` with derived size columns, sorted by position.

    Adds ``width`` and ``height`` from the bbox columns, then
    ``half_width``, ``half_height``, ``area`` and ``aspect``
    (width / height) from those two. The result is sorted by
    ``ymin``, ``ymax``, ``xmin``, ``xmax``. The input frame is not modified.
    """
    sized = df.assign(
        width=df.xmax - df.xmin,
        height=df.ymax - df.ymin,
    )
    # Second step reads the width/height columns created just above.
    enriched = sized.assign(
        half_width=sized.width / 2,
        half_height=sized.height / 2,
        area=sized.width * sized.height,
        aspect=sized.width / sized.height,
    )
    sort_keys = ['ymin', 'ymax', 'xmin', 'xmax']
    return enriched.sort_values(sort_keys)
testing
)
Example (showing letter bounding boxes and baseline)
pd.read_excel()
MultiIndex
How to get in touch:
@dimitern
One more thing,
buy Wes McKinney's book "Python for Data Analysis" (seriously)