Extracting Structured Data From Wikipedia

Extracting Structured Data from Unstructured Wikipedia Sites.

This project we develop a tool to extract structured wikipedia tables as csv or json given a wikipedia website link.

Useful for data mining.

#!/usr/bin/env python
# echo:true
#"""Wikipedia tables extractor."""
from bs4 import BeautifulSoup
import bs4
import requests
import itertools
import pandas as pd
import numpy as np
import re
import sys
import argparse
from typing import List, Optional, Dict, Tuple, Iterable
from collections import OrderedDict

# echo:true
def get_tables(wikipedia_url):
    """Use BeautifulSoup to get a list of table elements of class wikitable."""
    wikipedia_url = wikipedia_url
    page = requests.get(wikipedia_url)
    soup = BeautifulSoup(page.content, "lxml")
    tables = soup.find_all("table", {"class", "table"})  # or wikitable
    wiki_tables = soup.find_all("table", {"class": "wikitable"})
    tables = wiki_tables if len(wiki_tables) > 0 else tables
    # table = tables[table_from_top - 1]
    return tables

class TH:
    """Table Column Header.

    Represents a column header in an html table <th>.

    :param text: (str) The text of the header i.e the column name.
    :param index: (int) The index of the header i.e the column.
    :param level: (int) The level of the header.
    :param rowspan: (int) The number of rows the header spans.
    :param colspan: (int) The number of columns the header spans.
    """

    def __init__(
        self,
        text: str,
        index: int,
        level: int = 0,
        rowspan: int = 1,
        colspan: int = 1,
        parent: Optional["TH"] = None,
    ):
        self.text = text
        self.index = index
        self.level = level
        self.rowspan = rowspan
        self.colspan = colspan
        self.parent = parent
        self.children: List[TH] = []
        self.rowspan_child: Optional[TH] = None
        if self.rowspan > 1:
            self.set_rowspan_child(
                TH(
                    self.text,
                    self.index,
                    level + 1,
                    self.rowspan - 1,
                    self.colspan,
                )
            )
        if parent is not None:
            parent.add_child(self)

    def set_rowspan_child(self, child: "TH"):
        """Set the rowspan child of the header."""
        self.rowspan_child = child

    def add_child(self, child: "TH"):
        """Add a child to the header."""
        child.parent = self
        self.children.append(child)

    def set_parent(self, parent: "TH"):
        """Set the parent of the header."""
        self.parent = parent

    @property
    def expanded(self) -> List["TH"]:
        """Get the expanded version of the header.

        The expanded version of the column header is a list of headers
        that are expanded from the current header if it spans multiple columns.

        :return: (List[TH]) The expanded version of the header.
        """
        return [
            TH(
                self.text,
                self.index + i,
                self.level,
                self.rowspan,
                1,
                self.parent,
            )
            for i in range(self.colspan)
        ]

    def __repr__(self):
        return self.text

    def __str__(self):
        return self.text

class LevelRow:
    """Represents a Row of html table headers(<th>).

    :param level: (int) The level of the row.
    :param children: (List[TH]) The children of the row.
    """

    def __init__(self, level, children: List[TH]):
        self.level = level
        self.children: List[TH] = children

    @property
    def expanded(self) -> List[TH]:
        """Get the expanded version of the row.

        The expanded version of the row is a list of headers
        that are expanded from the current row if it spans multiple rows.

        :return: (List[TH]) The expanded version of the row.
        """
        expanded = []
        for child in self.children:
            expanded.extend(child.expanded)
        return expanded

    def expanded_text(self) -> List[str]:
        """Return a list of the feature names in the expanded row."""
        return [child.text for child in self.expanded]

    def rowspan_exists_at(self, index: int) -> bool:
        """Check if the row at index spans multiple rows."""
        if index >= len(self.children):
            return False
        return self.expanded[index].rowspan > 1

    def get_rowspan_child_at(self, index: int) -> Optional[TH]:
        """Get the child of the header that spans multiple rows at the index.

        The rowspan child at the index is the header that is expanded from the
        rowspan at the index with the rowspan attribute less one.
        :param index: (int) The index of the rowspan.
        :return: (TH) The rowspan child of the rowspan at the index.
        :raises AssertionError: If a rowspan does not exist at the index.
        """
        assert self.rowspan_exists_at(
            index
        ), "No Column spanning multiple rows exists at the index"
        return self.expanded[index].rowspan_child

class WikiTableParser:
    """Wikipedia Table Parser.

    Parses a wikipedia table and extracts the data from it.
    :param table_soup: (bs4.element.Tag) The soup of the table.
    """

    _float_regexp = re.compile(
        r"^[-+]?(?:\b[0-9]+(?:\.[0-9]*)?|\.[0-9]+\b)(?:[eE][-+]?[0-9]+\b)?$"
    ).match

    def __init__(self, table_soup: bs4.element.Tag):
        self.table_soup = table_soup
        self.all_rows = self.table_soup.find_all("tr")
        self.num_rows = len(self.all_rows)
        self.header_rows = self.find_header_rows()
        self.num_hrows = len(self.header_rows)
        self.per_level_headers: Dict[int, Optional[LevelRow]] = OrderedDict(
            {i: None for i in range(self.num_hrows)}
        )
        self.num_data_rows = self.num_rows - self.num_hrows
        self.feature_tuples = self.get_feature_tuples()
        self.num_cols = len(self.feature_tuples)
        self.feature_names = self.get_feature_names()
        self.table_values = self.get_table_values()
        self.table_data = self.get_table_data()

    @staticmethod
    def is_float_re(str) -> bool:
        """Check if a string is a floating point number."""
        return True if WikiTableParser._float_regexp(str) else False

    @staticmethod
    def remove_duplicates(iterable: Iterable) -> Tuple:
        """Remove duplicates from an iterable.

        Useful when creating a feature name from a tuple.
        The tuple will only contain unique elements and may be used in creating
        a pandas.MultiIndex from tuples.
        """
        after = []
        for i in iterable:
            if i not in after:
                after.append(i)
            else:
                after.append("")
        return tuple(after)

    @staticmethod
    def remove_bracs_parens(string: str, include_parens: bool = False) -> str:
        """Remove brackets and parentheses from a string.

        Also removes newlines and any extra whitespaces longer than 1.
        """
        if include_parens:
            return re.sub(r"\n|(\s+){2,20}|\[.*\]", "", string)
        return re.sub(r"\n|(\s+){2,20}|\(.*\)|\[.*\]", "", string)

    @staticmethod
    def remove_wiki_refs(string: str) -> str:
        """Remove any wikipedia references from a string."""
        return re.sub(r"\[(\d+)\]|\[(\w+)\]", "", string)

    @staticmethod
    def get_feature_name(
        header: bs4.element.Tag,
        include_parens: bool = True,
    ) -> str:
        """Get Feature Name.

        The feature name is the text of the header element with leading and
        trailing whitespace removed together with any wikipedia references.

        :param header: (bs4.element.Tag) The header element.
        :param include_parens: (bool) Whether to include parentheses.
        :return: (str) The feature name.
        """
        feature_name = "".join(header.find_all(string=True))
        feature_name.replace(r"\n", " ")
        lspaces = sum(1 for _ in itertools.takewhile(
            str.isspace, feature_name))
        feature_name = feature_name[lspaces:]
        feature_name = WikiTableParser.remove_bracs_parens(
            feature_name,
            include_parens,
        )
        feature_name = WikiTableParser.remove_wiki_refs(feature_name)
        return feature_name.strip()

    @property
    def has_defined_thead(self) -> bool:
        """Check if the table has a defined thead element."""
        return self.table_soup.find("thead") is not None

    @property
    def shape(self) -> Tuple[int, int]:
        """Return the (number of rows, number of columns) of the table."""
        return (self.num_data_rows, self.num_cols)

    @property
    def table_caption(self) -> Optional[str]:
        """Get Table Caption if present."""
        caption = self.table_soup.find("caption")
        if caption is not None:
            return caption.text.strip()
        else:
            return None

    def find_header_rows(self) -> List[bs4.element.Tag]:
        """Find Header Rows.

        Returns a list of the header rows in the table.
        """
        rows_until_first_td = []
        for row in self.all_rows:
            if row.find("td") is not None:
                break
            elif row.find("th") is not None:
                rows_until_first_td.append(row)
        return rows_until_first_td

    @property
    def zipped_th_tuples(self) -> List[Tuple[bs4.element.Tag]]:
        headers = []
        for i, plh in self.per_level_headers.items():
            headers.append(plh.expanded) if plh is not None else None
        return list(zip(*headers))

    def get_feature_tuples(self) -> List[Tuple[str]]:
        """Return a List of feature tuples.

        The feature tuples are tuples of the feature names for each row in the
        table header in the order they are found in the table.
        """
        self._process_headers()
        headers = []
        for i, plh in self.per_level_headers.items():
            headers.append(plh.expanded_text()) if plh is not None else None
        return list(zip(*headers))

    def get_feature_names(self) -> List[Tuple[str]]:
        """Get Feature Names.

        Returns a list of feature names in the table normalized to be usable
        and pandas DataFrame multi-indexes.
        """
        return [self.remove_duplicates(t) for t in self.feature_tuples]

    @property
    def get_table_headers(self) -> List[Tuple[str]]:
        """Return a list of feature_names or the headers of a table."""
        return self.feature_names

    def get_table_data(self) -> List[Dict[Tuple[str], str]]:
        """Get Table Data.

        Returns a list of dict objects key value pairs of
        [{feature_name[i] :feature_value[i],...},...].
        """
        samples = []
        feature_names = self.feature_names
        sample_rows = self.table_values
        for features in sample_rows:
            samples.append(dict(zip(feature_names, features)))
        return samples

    def get_table_values(self) -> List[List[str]]:
        """Return the Feature values of a table element."""
        values = []
        sample_rows = self.table_soup.find_all("tr")
        for sample_row in sample_rows:
            features = []
            for feature_col in sample_row.find_all("td"):
                n_spans = int(feature_col.attrs.get("colspan", 1))
                text = self.get_feature_name(feature_col, include_parens=False)
                if "," in text:
                    x = text.replace(",", "").replace(" ", "")
                    if (
                        x.isnumeric()
                        or x.isdecimal()
                        or x.isdigit()
                        or self.is_float_re(x)
                        or x == ""                        
                        or (x.endswith("%") and self.is_float_re(x.replace("%","")))
                    ):
                        for _ in range(n_spans):
                            features.append(x)
                    else:
                        for _ in range(n_spans):
                            features.append(f"'{text}'")
                else:
                    for _ in range(n_spans):
                        features.append(text)
            if len(features) > 0:
                values.append(features)
        return values

    def _process_headers(self):
        """Process Headers.

        Extracts the headers from the table and sets the per_level_headers.
        """
        for level, header_row in enumerate(self.header_rows):
            level_headers = []
            for header in header_row.find_all("th"):
                feature_name = self.get_feature_name(header)
                rowspan = int(header.attrs.get("rowspan", 1))
                colspan = int(header.attrs.get("colspan", 1))
                index = sum([lh.colspan for lh in level_headers])
                if level > 0:
                    prev_lh = self.per_level_headers[level - 1]
                    prev_expanded = prev_lh.expanded
                    while prev_lh.rowspan_exists_at(index):
                        level_headers.append(
                            prev_lh.get_rowspan_child_at(index),
                        )
                        index += 1
                    parent = (
                        prev_expanded[index] if index < len(
                            prev_expanded) else None
                    )
                    th = TH(feature_name, index, level,
                            rowspan, colspan, parent)
                    level_headers.extend(th.expanded)
                    fdx = th.expanded[-1].index + 1
                    while prev_lh.rowspan_exists_at(fdx):
                        level_headers.append(prev_lh.get_rowspan_child_at(fdx))
                        fdx += 1
                else:
                    th = TH(feature_name, index, level, rowspan, colspan, None)
                    level_headers.extend(th.expanded)
            self.per_level_headers[level] = LevelRow(level, level_headers)

    @property
    def as_pandas_df(self, with_headers=True) -> pd.DataFrame:
        """Return the table as a pandas DataFrame.

        :param with_headers: (bool) Include the headers in the DataFrame.
        :return: (pd.DataFrame) The table as a pandas DataFrame.
        """
        if not with_headers:
            return pd.DataFrame(self.table_values).dropna(axis=0, how="all")
        df = pd.DataFrame(self.table_data).dropna(axis=0, how="all")
        columns = df.columns
        tup_cols = [c for c in columns if isinstance(c, tuple)]
        if len(tup_cols) == 0:
            return df
        if len(tup_cols) == len(columns):
            df.columns = pd.MultiIndex.from_tuples(tup_cols)
        return df

    def print_headers_debug(self):
        """Print debug information about the headers of the table."""
        for i in range(self.num_hrows):
            lh = self.per_level_headers[i]
            row = lh.expanded_text()
            print(row, "len:", len(row))
        print(self.zipped_th_tuples)

def print_table_headers(table_headers):
    """Print a formatted table feature_names or headers."""
    print(
        ",".join([str(i) for i in table_headers])
        .replace("\n", "")
        .replace(" ,", ",")
        .replace("  ", " ")
    )

def print_table_values(table_values):
    """Print a formatted table feature_values or rows."""
    for row in table_values[:5]:
        print(",".join([i if i else "" for i in row]).replace("\n", ""))


def list_page_tables(tables: List[WikiTableParser], verbose=False):
    """Print all headers of all tables in tables."""
    ps=[]
    for i, table in enumerate(tables):
        print(f"\n{'==='*13}> Table {i+1}")
        ps_table = WikiTableParser(table)
        print("table caption: ", ps_table.table_caption)
        print_table_headers(ps_table.get_table_headers)
        print_table_values(ps_table.table_values)
        ps_table.print_headers_debug() if verbose else None
        print("===" * 13)
        ps.append(ps_table)
    return ps

def save_df_to_json(df, filename):
    """Save the pd.DataFrame to a CSV file with the given filename."""
    return df.to_json(filename, index=False, orient="records")


def save_df_to_csv(df, filename):
    """Save the pd.DataFrame to a CSV file with the given filename."""
    return df.to_csv(filename, index=False)


def get_filename_from_wikipedia_url(url, ext=".csv"):
    """Get filename from wikipedia_url."""
    return url.split("/")[-1] + ext

def run_table_extraction(
    wikipedia_url,
    save_to_csv=False,
    save_all=False,
    save_to_json=False,
    save_all_json=False,
    table_number=1,
    outf=None,
    exclude_column=None,
    **kwargs,
):
    args = argparse.Namespace(
        wikipedia_url=wikipedia_url,
        save_to_csv=save_to_csv,
        save_all=save_all,
        save_to_json=save_to_json,
        save_all_json=save_all_json,
        N=table_number,
        outf=outf,
        exclude_col=exclude_column,
    )
    tables = get_tables(args.wikipedia_url)
    ps_tables = [WikiTableParser(table) for table in tables]
    if not len(tables):
        print("0 Tables Found!\tExiting...")
        sys.exit(0)
    if args.save_all:
        if not args.outf:
            filename = get_filename_from_wikipedia_url(args.wikipedia_url)
        else:
            filename = args.outf
        for i in range(len(tables)):
            table = ps_tables[i]
            df = table.as_pandas_df
            cols = df.columns
            if args.exclude_col:
                col = cols[args.exclude_col - 1]
                cols = [c for c in cols if c != col]
            save_to_csv(df[cols], filename[:-4] + str(i) + ".csv")
    if args.save_all_json:
        if not args.outf:
            filename = get_filename_from_wikipedia_url(
                args.wikipedia_url,
                ext=".json",
            )
        else:
            filename = args.outf
        for i in range(len(tables)):
            table = ps_tables[i]
            df = table.as_pandas_df
            cols = df.columns
            if args.exclude_col:
                col = cols[args.exclude_col - 1]
                cols = [c for c in cols if c != col]
            save_to_json(df[cols], filename[:-4] + str(i) + ".json")
    if args.save_to_csv:
        N = int(args.N)
        table = ps_tables[N - 1]
        if not args.outf:
            filename = get_filename_from_wikipedia_url(args.wikipedia_url)
        else:
            filename = args.outf
        df = table.as_pandas_df
        cols = df.columns
        if args.exclude_col:
            col = cols[args.exclude_col - 1]
            cols = [c for c in cols if c != col]
        save_to_csv(df[cols], filename)
    else:
        filename = get_filename_from_wikipedia_url(args.wikipedia_url)
        list_page_tables(tables)
        print("\n")
        print(f"Default Output Filename: {filename}")
        dfs = []
        for i in range(len(tables)):
            table = ps_tables[i]
            df = table.as_pandas_df
            cols = df.columns
            if args.exclude_col:
                col = cols[args.exclude_col - 1]
                cols = [c for c in cols if c != col]
            dfs.append(df[cols])
        return dfs,ps_tables
    return [p.as_pandas_df for p in ps_tables], ps_tables

Testing

# echo:true
dfs,ts = run_table_extraction(
    wikipedia_url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
    save_to_csv=False,
    save_all=False,
    save_to_json=False,
    save_all_json=False,
    table_number=1,
    outf=None,
    exclude_column=None,
)

df=dfs[0]
lcs=list(df.columns)[1:]
yrs=[l for l in lcs if l[1]=='Year']
vals=[x for x in lcs if x not in yrs]
df[vals]=df[vals].replace({"—":np.nan,})

=======================================> Table 1
table caption:  GDP (million US$) by country
('Country/Territory', ''),('IMF', 'Forecast'),('IMF', 'Year'),('World Bank', 'Estimate'),('World Bank', 'Year'),('United Nations', 'Estimate'),('United Nations', 'Year')
World,109529216,2024,105435540,2023,100834796,2022
United States,28781083,2024,27360935,2023,25744100,2022
China,18532633,2024,17794782,2023,17963170,2022
Germany,4591100,2024,4456081,2023,4076923,2022
Japan,4110452,2024,4212945,2023,4232173,2022
=======================================


Default Output Filename: List_of_countries_by_GDP_(nominal).csv

# echo:true
df.shape

(210, 7)

Nominal GDP in (million US$)

df = df.astype({('IMF','Year'):str,('World Bank', 'Year'):str,('United Nations', 'Year'):str})
df = df.astype({t:np.float32 for t in vals})
df.dtypes
df.describe(include=["object"])
df.describe()

Country/Territory               object
IMF                Forecast    float32
                   Year         object
World Bank         Estimate    float32
                   Year         object
United Nations     Estimate    float32
                   Year         object
dtype: object

| | Country/Territory | IMF | World Bank | United Nations | |--------|-------------------|------|------------|----------------| | | | Year | Year | Year | | count | 210 | 210 | 210 | 210 | | unique | 210 | 4 | 4 | 2 | | top | World | 2024 | 2023 | 2022 | | freq | 1 | 190 | 186 | 209 |

| | IMF | World Bank | United Nations | |-------|----------------|----------------|----------------| | | Forecast | Estimate | Estimate | | count | 195.00 | 202.00 | 209.00 | | mean | 1,123,227.75 | 1,037,206.12 | 961,269.88 | | std | 8,200,460.50 | 7,751,885.00 | 7,293,077.00 | | min | 66.00 | 62.00 | 59.00 | | 25% | 13,276.00 | 9,284.00 | 8,772.00 | | 50% | 46,790.00 | 37,573.00 | 31,717.00 | | 75% | 297,844.50 | 259,432.25 | 237,101.00 | | max | 109,529,216.00 | 105,435,536.00 | 100,834,800.00 |

GDP (million US$) by country

Top 10

# echo:true
df.head(10)

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|-------------------|----------------|------|----------------|------|----------------|------| | | | Forecast | Year | Estimate | Year | Estimate | Year | | 0 | World | 109,529,216.00 | 2024 | 105,435,536.00 | 2023 | 100,834,800.00 | 2022 | | 1 | United States | 28,781,084.00 | 2024 | 27,360,936.00 | 2023 | 25,744,100.00 | 2022 | | 2 | China | 18,532,632.00 | 2024 | 17,794,782.00 | 2023 | 17,963,170.00 | 2022 | | 3 | Germany | 4,591,100.00 | 2024 | 4,456,081.00 | 2023 | 4,076,923.00 | 2022 | | 4 | Japan | 4,110,452.00 | 2024 | 4,212,945.00 | 2023 | 4,232,173.00 | 2022 | | 5 | India | 3,937,011.00 | 2024 | 3,549,919.00 | 2023 | 3,465,541.00 | 2022 | | 6 | United Kingdom | 3,495,261.00 | 2024 | 3,340,032.00 | 2023 | 3,089,072.00 | 2022 | | 7 | France | 3,130,014.00 | 2024 | 3,030,904.00 | 2023 | 2,775,316.00 | 2022 | | 8 | Brazil | 2,331,391.00 | 2024 | 2,173,666.00 | 2023 | 1,920,095.00 | 2022 | | 9 | Italy | 2,328,028.00 | 2024 | 2,254,851.00 | 2023 | 2,046,952.00 | 2022 |

GDP (million US$) by country

Bottom 10

# echo:true
df.tail(10)

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|-----------------------|----------|------|------------|------|----------------|------| | | | Forecast | Year | Estimate | Year | Estimate | Year | | 200 | Samoa | 1,024.00 | 2024 | 934.00 | 2023 | 857.00 | 2022 | | 201 | São Tomé and Príncipe | 751.00 | 2024 | 603.00 | 2023 | 546.00 | 2022 | | 202 | Dominica | 708.00 | 2024 | 654.00 | 2023 | 612.00 | 2022 | | 203 | Tonga | 581.00 | 2024 | 500.00 | 2022 | 488.00 | 2022 | | 204 | Micronesia | 484.00 | 2024 | 460.00 | 2023 | 427.00 | 2022 | | 205 | Kiribati | 311.00 | 2024 | 279.00 | 2023 | 223.00 | 2022 | | 206 | Palau | 308.00 | 2024 | 263.00 | 2023 | 225.00 | 2022 | | 207 | Marshall Islands | 305.00 | 2024 | 284.00 | 2023 | 279.00 | 2022 | | 208 | Nauru | 161.00 | 2024 | 154.00 | 2023 | 147.00 | 2022 | | 209 | Tuvalu | 66.00 | 2024 | 62.00 | 2023 | 59.00 | 2022 |

# echo:true
df[df['Country/Territory']=='Kenya']

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|-------------------|------------|------|------------|------|----------------|------| | | | Forecast | Year | Estimate | Year | Estimate | Year | | 70 | Kenya | 104,001.00 | 2024 | 107,441.00 | 2023 | 113,419.00 | 2022 |

Nominal GDP Per Capita in USD

dfs2,ts2=run_table_extraction(
    wikipedia_url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita",
    save_to_csv=False,
    save_all=False,
    save_to_json=False,
    save_all_json=False,
    table_number=1,
    outf=None,
    exclude_column=None,
)
#len(dfs2)
df2=dfs2[0]
lcs2=list(df2.columns)[1:]
yrs2=[l for l in lcs2 if l[1]=='Year']
vals2=[x for x in lcs2 if x not in yrs2]
df2[vals2]=df2[vals2].replace({"—":np.nan,})

=======================================> Table 1
table caption:  GDP (in USD) per capita by country, territory, non-sovereign state or non-IMF member
('Country/Territory', ''),('IMF', 'Estimate'),('IMF', 'Year'),('World Bank', 'Estimate'),('World Bank', 'Year'),('United Nations', 'Estimate'),('United Nations', 'Year')
Monaco,—,—,240862,2022,234317,2021
Liechtenstein,—,—,187267,2022,169260,2021
Luxembourg,131384,2024,128259,2023,133745,2021
Bermuda,—,—,123091,2022,112653,2021
Ireland,106059,2024,103685,2023,101109,2021
=======================================


Default Output Filename: List_of_countries_by_GDP_(nominal)_per_capita.csv

# echo:true
df2.shape

(222, 7)

df2 = df2.astype({t:np.float32 for t in vals2})
df2 = df2.astype({t:str for t in yrs2})
df2.dtypes
df2.describe(include=["object"])
df2.describe()

Country/Territory               object
IMF                Estimate    float32
                   Year         object
World Bank         Estimate    float32
                   Year         object
United Nations     Estimate    float32
                   Year         object
dtype: object

| | Country/Territory | IMF | World Bank | United Nations | |--------|-------------------|------|------------|----------------| | | | Year | Year | Year | | count | 222 | 222 | 222 | 222 | | unique | 222 | 4 | 8 | 3 | | top | Monaco | 2024 | 2023 | 2021 | | freq | 1 | 190 | 188 | 212 |

| | IMF | World Bank | United Nations | |-------|------------|------------|----------------| | | Estimate | Estimate | Estimate | | count | 195.00 | 216.00 | 213.00 | | mean | 18,333.29 | 21,421.44 | 18,584.14 | | std | 23,901.34 | 31,290.97 | 29,127.32 | | min | 230.00 | 200.00 | 302.00 | | 25% | 2,579.00 | 2,524.00 | 2,306.00 | | 50% | 7,327.00 | 8,257.00 | 6,785.00 | | 75% | 24,080.00 | 29,269.00 | 21,390.00 | | max | 131,384.00 | 240,862.00 | 234,317.00 |

GDP (in USD) per capita by country, territory, non-sovereign state or non-IMF member

Top 10

# echo:true
df2.head(10)

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|-------------------|------------|------|------------|------|----------------|------| | | | Estimate | Year | Estimate | Year | Estimate | Year | | 0 | Monaco | NaN | — | 240,862.00 | 2022 | 234,317.00 | 2021 | | 1 | Liechtenstein | NaN | — | 187,267.00 | 2022 | 169,260.00 | 2021 | | 2 | Luxembourg | 131,384.00 | 2024 | 128,259.00 | 2023 | 133,745.00 | 2021 | | 3 | Bermuda | NaN | — | 123,091.00 | 2022 | 112,653.00 | 2021 | | 4 | Ireland | 106,059.00 | 2024 | 103,685.00 | 2023 | 101,109.00 | 2021 | | 5 | Switzerland | 105,669.00 | 2024 | 99,995.00 | 2023 | 93,525.00 | 2021 | | 6 | Cayman Islands | NaN | — | 96,074.00 | 2022 | 85,250.00 | 2021 | | 7 | Norway | 94,660.00 | 2024 | 87,962.00 | 2023 | 89,242.00 | 2021 | | 8 | Isle of Man | NaN | — | 94,124.00 | 2021 | NaN | — | | 9 | Singapore | 88,447.00 | 2024 | 84,734.00 | 2023 | 66,822.00 | 2021 |

GDP (in USD) per capita by country, territory, non-sovereign state or non-IMF member

Bottom 10

# echo:true
df2.tail(10)

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|--------------------------|----------|------|------------|------|----------------|------| | | | Estimate | Year | Estimate | Year | Estimate | Year | | 212 | Sudan | 547.00 | 2024 | 2,272.00 | 2023 | 786.00 | 2021 | | 213 | Madagascar | 538.00 | 2024 | 529.00 | 2023 | 500.00 | 2021 | | 214 | Central African Republic | 538.00 | 2024 | 445.00 | 2023 | 461.00 | 2021 | | 215 | Sierra Leone | 527.00 | 2024 | 433.00 | 2023 | 505.00 | 2021 | | 216 | Yemen | 486.00 | 2024 | 533.00 | 2023 | 302.00 | 2021 | | 217 | Malawi | 481.00 | 2024 | 673.00 | 2023 | 613.00 | 2021 | | 218 | South Sudan | 422.00 | 2024 | 1,072.00 | 2015 | 400.00 | 2021 | | 219 | Afghanistan | 422.00 | 2022 | 353.00 | 2022 | 373.00 | 2021 | | 220 | Syria | NaN | — | 421.00 | 2021 | 925.00 | 2021 | | 221 | Burundi | 230.00 | 2024 | 200.00 | 2023 | 311.00 | 2021 |

# echo:true
df2[df2['Country/Territory']=='Kenya']

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|-------------------|----------|------|------------|------|----------------|------| | | | Estimate | Year | Estimate | Year | Estimate | Year | | 177 | Kenya | 1,983.00 | 2024 | 1,950.00 | 2023 | 2,082.00 | 2021 |

# echo:true
df2[df2['Country/Territory']=='United States']

| | Country/Territory | IMF | | World Bank | | United Nations | | |-----|-------------------|-----------|------|------------|------|----------------|------| | | | Estimate | Year | Estimate | Year | Estimate | Year | | 10 | United States | 85,373.00 | 2024 | 81,695.00 | 2023 | 69,185.00 | 2021 |

List of countries by minimum wage

# echo:true
dfs3,ts3 = run_table_extraction(
    wikipedia_url="https://en.wikipedia.org/wiki/List_of_countries_by_minimum_wage",
)

=======================================> Table 1
table caption:  Minimum wages by country. USD and Int$ PPP.
('Country', ''),('Minimum wage', ''),('Annual', 'Nominal (US$)'),('Annual', 'PPP (Int$)'),('Workweek (hours)', ''),('Hourly', 'Nominal (US$)'),('Hourly', 'PPP (Int$)'),('Percent of GDP per capita', ''),('Effective per', '')
Afghanistan,'؋5,500 . There was no minimum wage for permanent workers in the private sector.',858,3272,40,0.41,1.57,168.3%,2017
Albania,'L40,000 . The law establishes a 40-hour workweek, but the actual workweek is typically set by individual or collective-bargaining agreement.',4637,8697,40,2.23,4.18,75.4%,1 Apr 2023
Algeria,'د.ج 20,000 .',1777,6247,40,0.85,3,41.6%,1 May 2020
Andorra,€7.42  hourly.,18253,13493,40,8,6,28%,1 Jan 2023
Angola,'Kz 32,181',663,3161,44,0.29,1.38,49%,2022
=======================================

=======================================> Table 2
table caption:  OECD Real minimum wages (US dollar)[256]
('Country', '', ''),('2018', 'Nominal', 'Annual'),('2018', 'Nominal', 'Hourly'),('2018', 'PPP', 'Annual'),('2018', 'PPP', 'Hourly'),('2018', 'Annual workinghours', ''),('2019', 'Nominal', 'Annual'),('2019', 'Nominal', 'Hourly'),('2019', 'PPP', 'Annual'),('2019', 'PPP', 'Hourly'),('2019', 'Annual workinghours', '')
Australia,25970.8,13.1,24481.2,12.4,1976,26388.5,13.4,24874.9,12.6,1976
Belgium,21293.0,10.2,22746.8,10.9,2086,21410.8,10.3,22872.6,11.0,2086
Canada,20552.5,9.9,20946.0,10.1,2080,20880.7,10.0,21280.5,10.2,2080
Chile,4902.5,2.1,7044.4,3.0,2346,5101.7,2.2,7330.7,3.1,2346
Colombia,3451.3,1.2,7677.4,2.6,2920,3533.7,1.2,7860.9,2.7,2920
=======================================


Default Output Filename: List_of_countries_by_minimum_wage.csv

dfa=dfs3[0]

valsf=[("Annual","Nominal (US$)"),("Annual","PPP (Int$)"),("Hourly","Nominal (US$)"),("Hourly","PPP (Int$)"),]
isf=[("Workweek (hours)","")]
dfa[valsf]=dfa[valsf].replace({"":np.nan,})
dfa[isf]=dfa[isf].replace({"":np.nan})
dfa = dfa.astype({t:np.float64 for t in valsf})
dfa = dfa.astype({t:np.float64 for t in isf})

# echo:true
dfa.shape
dfa.dtypes
dfa.describe()
#dfa.describe(include="object")

(201, 9)

Country                                      object
Minimum wage                                 object
Annual                     Nominal (US$)    float64
                           PPP (Int$)       float64
Workweek (hours)                            float64
Hourly                     Nominal (US$)    float64
                           PPP (Int$)       float64
Percent of GDP per capita                    object
Effective per                                object
dtype: object

| | Annual | | Workweek (hours) | Hourly | | |-------|----------------|-------------|------------------|----------------|-------------| | | Nominal (US\$) | PPP (Int\$) | | Nominal (US\$) | PPP (Int\$) | | count | 159.00 | 156.00 | 190.00 | 158.00 | 155.00 | | mean | 6,387.95 | 11,044.29 | 42.58 | 3.11 | 4.92 | | std | 8,312.29 | 26,378.09 | 3.60 | 4.17 | 10.72 | | min | 0.00 | 1.00 | 35.00 | 0.00 | 0.00 | | 25% | 969.00 | 3,006.25 | 40.00 | 0.45 | 1.30 | | 50% | 3,009.00 | 6,758.00 | 40.00 | 1.36 | 3.00 | | 75% | 7,973.00 | 13,085.75 | 45.00 | 4.03 | 5.85 | | max | 36,685.00 | 322,362.00 | 52.00 | 18.12 | 129.15 |

# echo:true
dfa.head(10)

| | Country | Minimum wage | Annual | | Workweek (hours) | Hourly | | Percent of GDP per capita | Effective per | |-----|---------------------|---------------------------------------------------|----------------|-------------|------------------|----------------|-------------|---------------------------|------------------| | | | | Nominal (US\$) | PPP (Int\$) | | Nominal (US\$) | PPP (Int\$) | | | | 0 | Afghanistan | '؋5,500 . There was no minimum wage for perman... | 858.00 | 3,272.00 | 40.00 | 0.41 | 1.57 | 168.3% | 2017 | | 1 | Albania | 'L40,000 . The law establishes a 40-hour workw... | 4,637.00 | 8,697.00 | 40.00 | 2.23 | 4.18 | 75.4% | 1 Apr 2023 | | 2 | Algeria | 'د.ج 20,000 .' | 1,777.00 | 6,247.00 | 40.00 | 0.85 | 3.00 | 41.6% | 1 May 2020 | | 3 | Andorra | €7.42 hourly. | 18,253.00 | 13,493.00 | 40.00 | 8.00 | 6.00 | 28% | 1 Jan 2023 | | 4 | Angola | 'Kz 32,181' | 663.00 | 3,161.00 | 44.00 | 0.29 | 1.38 | 49% | 2022 | | 5 | Antigua and Barbuda | EC\$8.2 per hour . | 6,317.00 | 7,788.00 | 40.00 | 3.04 | 3.74 | 34.4% | 1 Nov 2014 | | 6 | Argentina | 'AR\$268,056 .' | 36,685.00 | 322,362.00 | 48.00 | 14.70 | 129.15 | 1616.7% | 1 September 2024 | | 7 | Armenia | '֏ 75,000 per month.' | 1,787.00 | 4,567.00 | 40.00 | 0.86 | 2.20 | 51.7% | 15 Nov 2022 | | 8 | Australia | 'Most workers are covered by an award, which m... | 35,810.00 | 29,767.00 | 38.00 | 18.12 | 15.06 | 64.7% | 1 July 2024 | | 9 | Austria | 'None: National collective bargaining agreemen... | NaN | NaN | 40.00 | NaN | NaN | | 2017 |

dfa[dfa.Country=="Kenya"]

| | Country | Minimum wage | Annual | | Workweek (hours) | Hourly | | Percent of GDP per capita | Effective per | |-----|---------|---------------------------------------------------|----------------|-------------|------------------|----------------|-------------|---------------------------|---------------| | | | | Nominal (US\$) | PPP (Int\$) | | Nominal (US\$) | PPP (Int\$) | | | | 92 | Kenya | 'Set by the government by location, age and sk... | 702.00 | 1,656.00 | 52.00 | 0.26 | 0.61 | 52.5% | 1 May 2017 |

dfb=dfs3[1]

valsf=[
    ("2018","Nominal","Annual"),("2018","Nominal","Hourly"),
    ("2018","PPP","Annual"),("2018","PPP","Hourly"),
    ("2019","Nominal","Annual"),("2019","Nominal","Hourly"),
    ("2019","PPP","Annual"),("2019","PPP","Hourly"),
    ]
isf=[("2018","Annual workinghours",""),("2019","Annual workinghours",""),]
dfb[valsf]=dfb[valsf].replace({"-":np.nan,})
dfb[isf]=dfb[isf].replace({"-":np.nan,})
dfb = dfb.astype({t:np.float64 for t in valsf})
dfb = dfb.astype({t:np.float64 for t in isf})

# echo:true
dfb.shape
dfb.dtypes
dfb.describe()

(32, 11)

Country                                  object
2018     Nominal              Annual    float64
                              Hourly    float64
         PPP                  Annual    float64
                              Hourly    float64
         Annual workinghours            float64
2019     Nominal              Annual    float64
                              Hourly    float64
         PPP                  Annual    float64
                              Hourly    float64
         Annual workinghours            float64
dtype: object

| | 2018 | | | | | 2019 | | | | | |-------|-----------|--------|-----------|--------|---------------------|-----------|--------|-----------|--------|---------------------| | | Nominal | | PPP | | Annual workinghours | Nominal | | PPP | | Annual workinghours | | | Annual | Hourly | Annual | Hourly | | Annual | Hourly | Annual | Hourly | | | count | 32.00 | 32.00 | 32.00 | 32.00 | 32.00 | 31.00 | 31.00 | 31.00 | 31.00 | 31.00 | | mean | 12,711.91 | 6.15 | 14,978.13 | 7.10 | 2,193.16 | 13,145.38 | 6.18 | 15,738.39 | 7.39 | 2,200.58 | | std | 7,958.94 | 4.10 | 6,367.94 | 3.22 | 281.41 | 8,048.24 | 3.99 | 6,409.05 | 3.29 | 302.70 | | min | 1,236.00 | 0.60 | 2,238.50 | 1.10 | 1,846.00 | 1,386.10 | 0.70 | 2,510.30 | 1.20 | 1,937.00 | | 25% | 5,917.75 | 2.80 | 9,916.52 | 4.90 | 2,079.75 | 6,568.50 | 2.85 | 11,048.90 | 5.35 | 2,079.50 | | 50% | 10,357.55 | 4.95 | 14,218.30 | 6.95 | 2,086.00 | 10,103.70 | 4.50 | 15,080.00 | 6.90 | 2,086.00 | | 75% | 20,644.05 | 10.05 | 20,992.55 | 10.12 | 2,237.25 | 21,163.55 | 10.30 | 21,675.25 | 10.35 | 2,210.50 | | max | 27,601.00 | 13.30 | 25,811.00 | 12.40 | 3,145.00 | 28,073.10 | 13.40 | 26,252.50 | 12.60 | 3,319.00 |

# echo:true
dfb.head(10)

| | Country | 2018 | | | | | 2019 | | | | | |-----|----------------|-----------|--------|-----------|--------|---------------------|-----------|--------|-----------|--------|---------------------| | | | Nominal | | PPP | | Annual workinghours | Nominal | | PPP | | Annual workinghours | | | | Annual | Hourly | Annual | Hourly | | Annual | Hourly | Annual | Hourly | | | 0 | Australia | 25,970.80 | 13.10 | 24,481.20 | 12.40 | 1,976.00 | 26,388.50 | 13.40 | 24,874.90 | 12.60 | 1,976.00 | | 1 | Belgium | 21,293.00 | 10.20 | 22,746.80 | 10.90 | 2,086.00 | 21,410.80 | 10.30 | 22,872.60 | 11.00 | 2,086.00 | | 2 | Canada | 20,552.50 | 9.90 | 20,946.00 | 10.10 | 2,080.00 | 20,880.70 | 10.00 | 21,280.50 | 10.20 | 2,080.00 | | 3 | Chile | 4,902.50 | 2.10 | 7,044.40 | 3.00 | 2,346.00 | 5,101.70 | 2.20 | 7,330.70 | 3.10 | 2,346.00 | | 4 | Colombia | 3,451.30 | 1.20 | 7,677.40 | 2.60 | 2,920.00 | 3,533.70 | 1.20 | 7,860.90 | 2.70 | 2,920.00 | | 5 | Czech Republic | 6,565.80 | 3.30 | 10,789.90 | 5.40 | 2,000.00 | 7,064.30 | 3.50 | 11,609.00 | 5.80 | 2,000.00 | | 6 | Estonia | 6,869.80 | 3.40 | 9,890.20 | 4.90 | 2,020.00 | 7,254.20 | 3.60 | 10,443.50 | 5.20 | 2,019.00 | | 7 | France | 20,989.70 | 13.20 | 21,860.30 | 12.00 | 2,289.00 | 21,889.60 | 11.20 | 21,949.00 | 12.10 | 2,189.00 | | 8 | Germany | 20,414.60 | 10.00 | 23,439.60 | 11.50 | 2,033.00 | 20,916.30 | 10.30 | 24,015.60 | 11.80 | 2,033.00 | | 9 | Greece | 9,208.70 | 3.70 | 13,040.00 | 5.20 | 2,507.00 | 10,103.70 | 4.00 | 14,307.40 | 5.70 | 2,507.00 |

dfb[dfb[("2019","Annual workinghours")]==-1]

| | Country | 2018 | | | | | 2019 | | | | | |-----|---------|-----------|--------|-----------|--------|---------------------|---------|--------|--------|--------|---------------------| | | | Nominal | | PPP | | Annual workinghours | Nominal | | PPP | | Annual workinghours | | | | Annual | Hourly | Annual | Hourly | | Annual | Hourly | Annual | Hourly | | | 13 | Japan | 16,805.50 | 8.10 | 16,607.60 | 8.00 | 2,080.00 | NaN | NaN | NaN | NaN | -1.00 |

Share on

Twitter Facebook LinkedIn

Tralah M Brian

Extracting Structured Data From Wikipedia

Extracting Structured Data from Unstructured Wikipedia Sites.

Testing

GDP (million US$) by country

Top 10

GDP (million US$) by country

Bottom 10

GDP (in USD) per capita by country, territory, non-sovereign state or non-IMF member

Top 10

GDP (in USD) per capita by country, territory, non-sovereign state or non-IMF member

Bottom 10

List of countries by minimum wage

Share on