https://karnwong.me/posts/rss.xml

Visualizing map region prefix/suffix

2020-09-03
import geopandas as gpd
import geoplot as gplt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geoplot import polyplot
from pythainlp.tokenize import syllable_tokenize, word_tokenize

Data structure

  • name: target region name
  • geometry: spatial column
  • *: parent region name, e.g. a "district" dataset would have a "province" column

Dissolving the dataset, in case you have multiple region levels in the same file

## load a district-level dataset and collapse it down to province level
district_filename = "FILE_PATH_HERE"

## region-name columns to keep (the geometry column is appended when subsetting)
used_columns = ["province", "district"]

gdf = gpd.read_file(district_filename).rename(
    columns={
        "prov_namt".upper(): "province",  # change to dummy
        "amp_namt".upper(): "district",
    }
)
gdf = gdf[[*used_columns, "geometry"]]

## desired data 🛎🛎🛎 please do create a dataset with the outermost region, so we can use it as boundary for visualization
## merge all district geometries that share a province, then turn the
## province index back into a regular "name" column
province = gdf.dissolve(by="province").reset_index()
province = province.rename(columns={"province": "name"})
province = province.drop(columns="district")
province
|     | name           | geometry                                          |
|-----|----------------|---------------------------------------------------|
| 0   | กระบี่           | MULTIPOLYGON (((99.14285 7.57282, 99.14256 7.5... |
| 1   | กรุงเทพมหานคร   | POLYGON ((100.51756 13.66185, 100.51754 13.661... |
| 2   | กาญจนบุรี        | POLYGON ((99.76845 14.09449, 99.76898 14.09458... |
| 3   | กาฬสินธุ์         | POLYGON ((103.54900 16.21370, 103.54763 16.213... |
| 4   | กำแพงเพชร      | POLYGON ((99.97734 16.11070, 99.97546 16.10861... |
| ... | ...            | ...                                               |
| 71  | เพชรบุรี         | POLYGON ((100.02689 12.91666, 100.02690 12.916... |
| 72  | เพชรบูรณ์        | POLYGON ((101.30859 15.57351, 101.30821 15.566... |
| 73  | เลย            | POLYGON ((102.01428 17.14017, 102.01439 17.140... |
| 74  | แพร่            | POLYGON ((99.64157 18.05575, 99.64237 18.05561... |
| 75  | แม่ฮ่องสอน       | POLYGON ((98.16045 18.15059, 98.16069 18.15037... |

76 rows × 2 columns

## declare dummy variable so it can be reused with other region type
## note: this binds `df` to the same object as `province` (no copy is made),
## so columns added through one name are visible through the other
df = province

EDA: tokenize the region names. Use a different tokenizer for your target language.

def tokenize(unique_region_values):
    """
    Break every region name into syllable tokens for manual inspection.

    input: unique values of region type
    return: dataframe with the name, its token list, the first two tokens
            (token_1-1, token_1-2), the last two tokens (token_2-1,
            token_2-2) and each pair concatenated (token_1_full,
            token_2_full); a position that does not exist in the token
            list comes back as NaN
    """

    result = pd.DataFrame()
    result["name"] = pd.Series(unique_region_values)
    result["token"] = result["name"].apply(syllable_tokenize)

    # Thai is written without spaces between words, so the tokenizer can
    # split in unexpected places or "clip" a token; the columns below are
    # meant to be eyeballed rather than trusted blindly
    tokens = result["token"].str

    # prefix candidates: the first two tokens and their concatenation
    result["token_1-1"] = tokens.get(0)
    result["token_1-2"] = tokens.get(1)
    result["token_1_full"] = result["token_1-1"] + result["token_1-2"]

    # suffix candidates: the last two tokens and their concatenation
    result["token_2-1"] = tokens.get(-2)
    result["token_2-2"] = tokens.get(-1)
    result["token_2_full"] = result["token_2-1"] + result["token_2-2"]

    return result

Don't forget to look through the results and pick tokens you think are "correct"

tokenize(df.name.unique())
|     | name         | token                  | token_1-1 | token_1-2 | token_1_full | token_2-1 | token_2-2 | token_2_full |
|-----|--------------|------------------------|-----------|-----------|--------------|-----------|-----------|--------------|
| 0   | กระบี่         | [กระ, บี่]               | กระ       | บี่         | กระบี่         | กระ       | บี่         | กระบี่         |
| 1   | กรุงเทพมหานคร | [กรุง, เทพ, มหา, นคร]   | กรุง       | เทพ       | กรุงเทพ       | มหา       | นคร       | มหานคร       |
| 2   | กาญจนบุรี      | [กาญ, จน, บุ, รี]        | กาญ       | จน        | กาญจน        | บุ         | รี         | บุรี           |
| 3   | กาฬสินธุ์       | [กาฬ, สินธุ์]             | กาฬ       | สินธุ์       | กาฬสินธุ์       | กาฬ       | สินธุ์       | กาฬสินธุ์       |
| 4   | กำแพงเพชร    | [กำ, แพง, เพชร]        | กำ        | แพง       | กำแพง        | แพง       | เพชร      | แพงเพชร      |
| ... | ...          | ...                    | ...       | ...       | ...          | ...       | ...       | ...          |
| 71  | เพชรบุรี       | [เพชร, บุ, รี]           | เพชร      | บุ         | เพชรบุ        | บุ         | รี         | บุรี           |
| 72  | เพชรบูรณ์      | [เพชร, บูรณ์]            | เพชร      | บูรณ์       | เพชรบูรณ์      | เพชร      | บูรณ์       | เพชรบูรณ์      |
| 73  | เลย          | [เลย]                  | เลย       | NaN       | NaN          | NaN       | เลย       | NaN          |
| 74  | แพร่          | [แพร่]                  | แพร่       | NaN       | NaN          | NaN       | แพร่       | NaN          |
| 75  | แม่ฮ่องสอน     | [แม่, ฮ่อง, สอน]         | แม่        | ฮ่อง       | แม่ฮ่อง        | ฮ่อง       | สอน       | ฮ่องสอน       |

76 rows × 8 columns

Tokenize with selected slugs

## replace with your slugs here
## NOTE(review): these literals look mis-encoded (UTF-8 Thai decoded with a
## legacy code page) -- confirm against the original post before running
slugs = ["āļ™āļ„āļĢ", "āļŠāļļ", "āļŠāļĄāļļāļ—āļĢ", "āļ˜āļēāļ™āļĩ", "āļ™āļ„āļĢ"]
## de-duplicate, then order longest-first so the first slug that matches is
## also the longest one. Plain reverse-alphabetical order does not guarantee
## that for suffixes (e.g. "ry" sorts after "bury" but is shorter).
## Equal-length slugs are ordered alphabetically to keep the result stable.
slugs = sorted(set(slugs), key=lambda slug: (-len(slug), slug))


## get prefix and suffix
def get_slug_1(x, candidates=None):
    """
    Find the prefix slug of a region name.

    input: x - region name (string)
           candidates - optional iterable of slugs to test; when omitted the
                        module-level `slugs` list is used
    return: the longest candidate that `x` starts with, or None if no
            candidate matches
    """
    if candidates is None:
        candidates = slugs
    # choose the longest match explicitly so the result does not depend on
    # how the candidate list happens to be ordered
    return max(
        (slug for slug in candidates if x.startswith(slug)),
        key=len,
        default=None,
    )


def get_slug_2(x, candidates=None):
    """
    Find the suffix slug of a region name.

    input: x - region name (string)
           candidates - optional iterable of slugs to test; when omitted the
                        module-level `slugs` list is used
    return: the longest candidate that `x` ends with, or None if no
            candidate matches
    """
    if candidates is None:
        candidates = slugs
    # several slugs can match the same ending (e.g. "ri" and "buri"); take the
    # longest one explicitly instead of whichever comes first in the list
    return max(
        (slug for slug in candidates if x.endswith(slug)),
        key=len,
        default=None,
    )
## tag every region with its matching prefix / suffix slug (None when no slug matches)
df["prefix"] = df["name"].apply(get_slug_1)
df["suffix"] = df["name"].apply(get_slug_2)

df
|     | name           | geometry                                          | prefix | suffix | class |
|-----|----------------|---------------------------------------------------|--------|--------|-------|
| 0   | กระบี่           | MULTIPOLYGON (((99.14285 7.57282, 99.14256 7.5... | None   | None   | class |
| 1   | กรุงเทพมหานคร   | POLYGON ((100.51756 13.66185, 100.51754 13.661... | None   | นคร    | class |
| 2   | กาญจนบุรี        | POLYGON ((99.76845 14.09449, 99.76898 14.09458... | None   | None   | class |
| 3   | กาฬสินธุ์         | POLYGON ((103.54900 16.21370, 103.54763 16.213... | None   | None   | class |
| 4   | กำแพงเพชร      | POLYGON ((99.97734 16.11070, 99.97546 16.10861... | None   | None   | class |
| ... | ...            | ...                                               | ...    | ...    | ...   |
| 71  | เพชรบุรี         | POLYGON ((100.02689 12.91666, 100.02690 12.916... | None   | None   | class |
| 72  | เพชรบูรณ์        | POLYGON ((101.30859 15.57351, 101.30821 15.566... | None   | None   | class |
| 73  | เลย            | POLYGON ((102.01428 17.14017, 102.01439 17.140... | None   | None   | class |
| 74  | แพร่            | POLYGON ((99.64157 18.05575, 99.64237 18.05561... | None   | None   | class |
| 75  | แม่ฮ่องสอน       | POLYGON ((98.16045 18.15059, 98.16069 18.15037... | None   | None   | class |

76 rows × 5 columns

Viz prep

## make total_bound (background outline)
## and extent (so the canvas would center at the same point)
## also, remember the PROVINCE dataset from the start? we're going to use that

## note: `df` was bound to this same `province` object earlier, so the new
## "class" column shows up on `df` as well
province["class"] = "class"  # a dummy column so it would dissolve the whole dataset
boundary = province.dissolve(by="class")  # every region shares the same class value, so all geometries merge together
extent = boundary.total_bounds  # bounding box of the merged outline, passed as `extent` when plotting
## set font (default matplotlib font can't render Thai)
plt.rcParams["font.family"] = "Tahoma"

Cleaning it up

There is some degree of Pali-Sanskrit influence in Thai, in which the word order is different, so a given *fix can appear as either a prefix or a suffix. It is like having both "repeat" and "dore" (for "redo").

## ⛩⛩⛩ rerun from this cell onward if you want to change *fix ⛩⛩⛩
## filter null *fix
_fix_column = "suffix"  # ⛩⛩⛩ change here ⛩⛩⛩
## .copy() makes df_temp an independent frame, so adding the count column
## below does not assign onto a filtered slice of `df`
## (which triggers pandas' SettingWithCopyWarning)
df_temp = df[df[_fix_column].notnull()].copy()

## get count: how many regions share each *fix value
viz_categ_count_column = "{}_count".format(_fix_column)
df_temp[viz_categ_count_column] = df_temp[_fix_column].map(
    df_temp[_fix_column].value_counts()
)
## at the largest region level it won't be much, but at a smaller level like subdistrict
## having a single *fix for the entire dataset can happen, hence we should filter it out

## filter for a *fix you want to visualize
## ⛩⛩⛩ use the second line if you want to set the threshold with median ⛩⛩⛩
threshold = 0
## threshold = df_temp[viz_categ_count_column].median()

## keep only the *fix values that occur at least `threshold` times
df_temp = df_temp[df_temp[viz_categ_count_column] >= threshold]
df_temp
|     | name           | geometry                                          | prefix | suffix | class | suffix_count |
|-----|----------------|---------------------------------------------------|--------|--------|-------|--------------|
| 1   | กรุงเทพมหานคร   | POLYGON ((100.51756 13.66185, 100.51754 13.661... | None   | นคร    | class | 2            |
| 25  | ปทุมธานี         | POLYGON ((100.91417 13.95445, 100.91415 13.952... | None   | ธานี    | class | 5            |
| 48  | สกลนคร         | POLYGON ((104.36246 17.09941, 104.36248 17.099... | None   | นคร    | class | 2            |
| 58  | สุราษฎร์ธานี      | MULTIPOLYGON (((99.20865 8.33715, 99.20647 8.3... | สุ      | ธานี    | class | 5            |
| 64  | อุดรธานี         | POLYGON ((103.44196 17.21428, 103.44246 17.214... | None   | ธานี    | class | 5            |
| 66  | อุทัยธานี         | POLYGON ((100.04080 15.29612, 100.04067 15.296... | None   | ธานี    | class | 5            |
| 67  | อุบลราชธานี      | POLYGON ((105.55486 14.95406, 105.55414 14.953... | None   | ธานี    | class | 5            |

Viz

import os

key_column = _fix_column
key_name = "province"  # ⛩⛩⛩ set region type here
key_count_column = "{}_count".format(key_column)

## one output folder per region type and *fix kind
out_dir = "plot/{}_{}".format(key_name, key_column)
os.makedirs(out_dir, exist_ok=True)

gdf = df_temp
for key in gdf[key_column].unique():
    ## background layer: outline of the whole dataset
    ax = gplt.polyplot(boundary, figsize=(10, 15))

    ## foreground layer: the regions that carry the current *fix
    query = gdf[gdf[key_column] == key]
    ## the count is identical on every row of `query`, so read the first one
    total_records = str(int(query[key_count_column].iloc[0]))

    gplt.polyplot(query, ax=ax, extent=extent, edgecolor="black", facecolor="green")

    plt.title("{}: {} records".format(key, total_records))

    ## zero-padded count goes first in the file name (presumably so the files
    ## sort by how common the *fix is)
    plt.savefig("{}/{}_{}.png".format(out_dir, total_records.zfill(3), key))

    ## close the figure once it is saved; otherwise every iteration leaves a
    ## figure open and memory grows with the number of *fix values
    ## (inside a notebook this also means the figure is not shown inline)
    plt.close(ax.figure)
##     break

Output structure

Some interesting outputs (at subdistrict level)

Northern region

You can see that the prefix "แม่" is concentrated in the northern region.

Eastern region

"โนน" seems to be specific to the eastern region, seeing that it's clustered around the eastern part of the country.

Multi-region

As expected, "บาง" is clustered around the central region. No surprise here, since the old name of Thailand's capital (which is located in the central region) is "บางกอก". But you can see that it's clustered around the southern parts as well.