<a href="https://colab.research.google.com/github/icculp/Learning-Bitcoin-from-the-Command-Line/blob/master/Chapter_word_count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run in colab by clicking the link above to view the results as a paginated table with word counts for each chapter near the bottom of the notebook. Total word count at the very bottom.

This notebook counts through the translatable words of each chapter, including chapter links; ignores code blocks, markdown characters, and tokens containing digits.

In [1]:
import os
import pandas as pd
import re

In [2]:
!git clone https://github.com/icculp/Learning-Bitcoin-from-the-Command-Line.git

fatal: destination path 'Learning-Bitcoin-from-the-Command-Line' already exists and is not an empty directory.


In [3]:
def count_words():
    """ Counts words ignoring code blocks and digits

        To test for quality:
            lines 14-16 to test a single chapter
            uncomment line 45 to view accepted word tokens
            uncomment line 
    """
    columns=['Chapter', 'Word Count']
    counts = []
    repo_path = '/content/Learning-Bitcoin-from-the-Command-Line/'
    for chapter in os.listdir(repo_path):
        ''' uncomment lines 8-10 to test a single chapter, replacing ch_name
            with the name you want to test
        '''
        #ch_name = '03_2_Knowing_Your_Bitcoin_Setup.md'
        #if chapter != ch_name:
        #    continue
        ignore_list = ['bitcoin.conf-annotated.txt', 'TODO.md', 'TODO-30.md']
        if chapter in ignore_list or\
                not chapter.endswith('md'):  # g
            continue
        count = 0
        flag = 0  # ignores words between code markdown ```
        with open(repo_path + chapter) as ch:
            for line in ch.readlines():
                if flag:
                    if '```' in line[:3].replace(' ', ''):
                        flag = 0
                        continue
                    continue
                if '```' in line:
                    flag = 1
                    continue
                for word in line.split():
                    if '.md' in word:
                        ch_link_tokens = word.split('_')
                        if ']' in word:  # indicates trailing link with chapter name; counts last word of trailing link before chapter name
                            count += 1
                        link_tokens_count = len(ch_link_tokens[2:])  # ignoring chapter numbers in chapter link tokens
                        count += link_tokens_count
                        # print(word, '[TOK]', link_tokens_count, end='[SEPTOK]')
                        continue
                    ignore =  ['*', '**', '#', '##', '###', '####',
                              '-', '—', '>', '`', '/', '&', '|', '~']
                    if any(ch.isdigit() for ch in word) or\
                            word in ignore or\
                            '`' in word or\
                            '~/' in word or\
                            '/.' in word or\
                            '|-' in word or\
                            (word[0] == ':' and word[-1] == ':') or\
                            (word[0] == '"' and word[-1] == '"'):
                        # print(word)  # , end='[SEP]')  # view rejected tokens
                        continue
                    # print(word, count)  # , end='[SEP]')  # view accepted tokens
                    count += 1
        counts.append((chapter, count))
        # print(chapter, count)
    return pd.DataFrame(counts, columns=columns)

In [4]:
chapter_word_counts = count_words()
chapter_word_counts.sort_values(by=['Chapter'], inplace=True)
# view accepted or rejected tokens below if line 53 or 51 uncommented in count_words(), respectively

In [5]:
from google.colab import data_table
data_table.DataTable(chapter_word_counts, include_index=False)

Unnamed: 0,Chapter,Word Count
72,01_0_Introduction.md,1144
21,01_1_Introducing_Bitcoin.md,2735
20,02_0_Setting_Up_a_Bitcoin-Core_VPS.md,226
97,02_1_Setting_Up_a_Bitcoin-Core_VPS_with_StackS...,2723
94,02_2_Setting_Up_Bitcoin_Core_Other.md,254
...,...,...
45,A3_0_Using_Bitcoin_Regtest.md,980
38,CLA.md,495
74,CONTRIBUTING.md,529
53,LICENSE-CC-BY-4.0.md,2716


In [6]:
total_count = chapter_word_counts['Word Count'].sum()
total_count

88215

To convert the table to a markdown format and save as 'Chapter_word_counts.md', run the cells below

In [7]:
from IPython.display import Markdown, display
from tabulate import tabulate


# borrowed from https://stackoverflow.com/questions/33181846/programmatically-convert-pandas-dataframe-to-markdown-table

def pandas_df_to_markdown_table(df):
    fmt = ['---' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    return Markdown(df_formatted.to_csv(sep="|", index=False))

def df_to_markdown(df, y_index=False):
    blob = tabulate(df, headers='keys', tablefmt='pipe')
    if not y_index:
        return '\n'.join(['| {}'.format(row.split('|', 2)[-1]) for row in blob.split('\n')])
    return blob

In [8]:
mkt = pandas_df_to_markdown_table(chapter_word_counts)

with open('Chapter_word_counts.md', 'w') as m:
    m.write(str(mkt.data))