Explore Human Mortality Database (HMD) data

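Reads the HMD text files for the United States from the USA/ folder, explores the period life tables, and writes the combined, male, and female 1x1 life tables to all.json.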

In [1]:
import altair as alt
import pandas as pd
import numpy as np

# Turn off Altair's limit on the number of rows a chart's dataset may contain.
alt.data_transformers.disable_max_rows()
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
!ls USA/*
USA/InputDB:
USAbirth.txt        USAdeath.txt        USAref.pdf
USAbirthbymonth.txt USAnote.pdf         USAtadj.txt
USAcom.pdf          USApop.txt          USAwarn.txt

USA/STATS:
Births.txt             Mx_1x5.txt             cMx_1x10.txt
Deaths_1x1.txt         Mx_5x1.txt             cMx_1x5.txt
Deaths_1x10.txt        Mx_5x10.txt            cMx_5x1.txt
Deaths_1x5.txt         Mx_5x5.txt             cMx_5x10.txt
Deaths_5x1.txt         Population.txt         cMx_5x5.txt
Deaths_5x10.txt        Population5.txt        fltper_1x1.csv
Deaths_5x5.txt         bltper_1x1.csv         fltper_1x1.txt
Deaths_lexis.txt       bltper_1x1.txt         fltper_1x10.txt
E0per.txt              bltper_1x10.txt        fltper_1x5.txt
E0per_1x10.txt         bltper_1x5.txt         fltper_5x1.txt
E0per_1x5.txt          bltper_5x1.txt         fltper_5x10.txt
Exposures_1x1.txt      bltper_5x10.txt        fltper_5x5.txt
Exposures_1x10.txt     bltper_5x5.txt         hmd_period_summary.csv
Exposures_1x5.txt      cExposures_1x1.txt     mltper_1x1.csv
Exposures_5x1.txt      cExposures_1x10.txt    mltper_1x1.txt
Exposures_5x10.txt     cExposures_1x5.txt     mltper_1x10.txt
Exposures_5x5.txt      cExposures_5x1.txt     mltper_1x5.txt
Exposures_lexis.txt    cExposures_5x10.txt    mltper_5x1.txt
Mx_1x1.txt             cExposures_5x5.txt     mltper_5x10.txt
Mx_1x10.txt            cMx_1x1.txt            mltper_5x5.txt
In [3]:
!head USA/STATS/Births.txt
The United States of America,  Births (1-year)	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

Year          Female      Male     Total
1933         1122180   1184820   2307000
1934         1166072   1229928   2396000
1935         1158000   1219000   2377000
1936         1148000   1207000   2355000
1937         1175000   1238000   2413000
1938         1217000   1280000   2497000
1939         1201000   1265000   2466000
In [4]:
!head USA/STATS/Deaths_1x1.txt
The United States of America, Deaths (period 1x1), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
  1933           0             52615.77        68438.11       121053.88
  1933           1              8917.13        10329.16        19246.29
  1933           2              4336.92         5140.05         9476.97
  1933           3              3161.59         3759.88         6921.47
  1933           4              2493.84         2932.59         5426.43
  1933           5              2139.87         2537.53         4677.40
  1933           6              1939.70         2337.76         4277.46
In [5]:
!head -n 50 USA/STATS/Deaths_5x10.txt
The United States of America, Deaths (period 5x10), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
1933-1939        0            362056.18       477977.01       840033.19
1933-1939      1-4            115143.59       135489.04       250632.63
1933-1939      5-9             53258.44        67564.33       120822.77
1933-1939     10-14            46694.50        62828.23       109522.73
1933-1939     15-19            81279.71        99734.97       181014.68
1933-1939     20-24           112947.56       129306.90       242254.46
1933-1939     25-29           124815.57       136924.68       261740.25
1933-1939     30-34           129395.90       151004.95       280400.85
1933-1939     35-39           153069.29       190604.31       343673.60
1933-1939     40-44           176225.26       243295.89       419521.15
1933-1939     45-49           213136.49       312163.99       525300.48
1933-1939     50-54           256553.94       382604.02       639157.96
1933-1939     55-59           290219.72       428887.96       719107.68
1933-1939     60-64           352944.85       492107.37       845052.22
1933-1939     65-69           414869.15       541491.24       956360.39
1933-1939     70-74           445865.48       538023.23       983888.71
1933-1939     75-79           438060.37       488250.91       926311.28
1933-1939     80-84           332338.39       333798.86       666137.25
1933-1939     85-89           190727.20       166439.33       357166.53
1933-1939     90-94            72551.42        52714.64       125266.06
1933-1939     95-99            17839.85        11080.82        28920.67
1933-1939    100-104            4683.89         2615.18         7299.07
1933-1939    105-109             682.78          343.77         1026.55
1933-1939    110+                 65.69           27.36           93.05
1940-1949        0            481556.53       647196.64      1128753.17
1940-1949      1-4             96777.46       117620.76       214398.22
1940-1949      5-9             41153.31        58200.03        99353.34
1940-1949     10-14            36212.73        55474.45        91687.18
1940-1949     15-19            67627.86       100348.40       167976.26
1940-1949     20-24            97106.04       140549.51       237655.55
1940-1949     25-29           109934.72       140989.54       250924.26
1940-1949     30-34           130650.19       161566.35       292216.54
1940-1949     35-39           164460.26       212052.48       376512.74
1940-1949     40-44           206538.92       290262.25       496801.17
1940-1949     45-49           267391.42       407085.98       674477.40
1940-1949     50-54           347278.86       565503.79       912782.65
1940-1949     55-59           421471.74       698098.93      1119570.67
1940-1949     60-64           518150.04       805480.61      1323630.65
1940-1949     65-69           641749.80       890759.96      1532509.76
1940-1949     70-74           730003.55       901430.09      1631433.64
1940-1949     75-79           716457.87       793951.81      1510409.68
1940-1949     80-84           593110.08       580661.73      1173771.81
1940-1949     85-89           360719.61       308121.43       668841.04
1940-1949     90-94           135287.52        97819.23       233106.75
1940-1949     95-99            32194.95        20340.94        52535.89
1940-1949    100-104            6930.45         4075.04        11005.49
1940-1949    105-109             771.50          429.90         1201.40
In [6]:
!cat USA/STATS/E0per_1x10.txt
The United States of America, Life expectancy at birth (period, 1x10)	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year       Female    Male     Total
1933-1939     63.42    59.34    61.26
1940-1949     68.19    63.10    65.53
1950-1959     72.35    66.19    69.13
1960-1969     73.87    66.76    70.19
1970-1979     76.12    68.45    72.24
1980-1989     78.14    70.97    74.59
1990-1999     79.15    72.78    76.02
2000-2009     80.08    74.97    77.58
2010-2019     81.22    76.34    78.80
2020-2022     79.85    74.28    77.01
In [7]:
!head USA/STATS/Exposures_1x1.txt
The United States of America, Exposure to risk (period 1x1), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
  1933           0            971181.32      1003854.39      1975035.71
  1933           1           1005773.86      1028926.84      2034700.70
  1933           2           1077549.59      1100519.76      2178069.35
  1933           3           1101794.00      1128155.35      2229949.34
  1933           4           1118327.09      1156106.87      2274433.95
  1933           5           1155442.55      1212990.41      2368432.96
  1933           6           1179393.29      1231712.01      2411105.29
In [8]:
!head USA/STATS/cExposures_1x1.txt
The United States of America, Exposure to risk (cohort 1x1), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
  1852           0                    .               .               .
  1852           1                    .               .               .
  1852           2                    .               .               .
  1852           3                    .               .               .
  1852           4                    .               .               .
  1852           5                    .               .               .
  1852           6                    .               .               .
In [9]:
!head USA/STATS/Mx_1x1.txt
The United States of America, Death rates (period 1x1), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
  1933           0             0.054177        0.068175        0.061292
  1933           1             0.008866        0.010039        0.009459
  1933           2             0.004025        0.004671        0.004351
  1933           3             0.002869        0.003333        0.003104
  1933           4             0.002230        0.002537        0.002386
  1933           5             0.001852        0.002092        0.001975
  1933           6             0.001645        0.001898        0.001774
In [10]:
!head USA/STATS/Mx_5x10.txt
The United States of America, Death rates (period 5x10), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
1933-1939        0             0.053437        0.067995        0.060850
1933-1939      1-4             0.004004        0.004598        0.004304
1933-1939      5-9             0.001343        0.001645        0.001497
1933-1939     10-14            0.001100        0.001452        0.001278
1933-1939     15-19            0.001938        0.002388        0.002162
1933-1939     20-24            0.002807        0.003296        0.003049
1933-1939     25-29            0.003297        0.003721        0.003506
In [11]:
!head USA/STATS/cMx_5x10.txt
The United States of America, Death rates (cohort 5x10), 	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age             Female            Male           Total
1852-1859        0                    .               .               .
1852-1859      1-4                    .               .               .
1852-1859      5-9                    .               .               .
1852-1859     10-14                   .               .               .
1852-1859     15-19                   .               .               .
1852-1859     20-24                   .               .               .
1852-1859     25-29                   .               .               .
In [12]:
!head USA/STATS/bltper_5x10.txt
The United States of America, Life tables (period 5x10), Total	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age         mx       qx    ax      lx      dx      Lx       Tx     ex
1933-1939        0      0.06085  0.05820  0.25  100000    5820   95645  6126226  61.26
1933-1939      1-4      0.00440  0.01741  1.41   94180    1640  372477  6030581  64.03
1933-1939      5-9      0.00150  0.00748  2.30   92540     692  460835  5658104  61.14
1933-1939     10-14     0.00128  0.00637  2.62   91848     585  457850  5197268  56.59
1933-1939     15-19     0.00217  0.01078  2.70   91263     984  454051  4739419  51.93
1933-1939     20-24     0.00305  0.01514  2.58   90280    1367  448084  4285368  47.47
1933-1939     25-29     0.00351  0.01739  2.54   88913    1546  440758  3837284  43.16
In [13]:
!head USA/STATS/fltper_5x10.txt
The United States of America, Life tables (period 5x10), Females	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age         mx       qx    ax      lx      dx      Lx       Tx     ex
1933-1939        0      0.05344  0.05139  0.25  100000    5139   96167  6341625  63.42
1933-1939      1-4      0.00410  0.01621  1.41   94861    1538  375461  6245458  65.84
1933-1939      5-9      0.00135  0.00672  2.27   93323     627  464905  5869997  62.90
1933-1939     10-14     0.00110  0.00548  2.63   92696     508  462279  5405092  58.31
1933-1939     15-19     0.00194  0.00966  2.71   92188     891  458901  4942814  53.62
1933-1939     20-24     0.00281  0.01395  2.59   91298    1274  453416  4483913  49.11
1933-1939     25-29     0.00330  0.01636  2.54   90024    1473  446493  4030497  44.77
In [14]:
!head -n 30 USA/STATS/mltper_5x10.txt
The United States of America, Life tables (period 5x10), Males	Last modified: 05 Sep 2024;  Methods Protocol: v6 (2017)

  Year          Age         mx       qx    ax      lx      dx      Lx       Tx     ex
1933-1939        0      0.06799  0.06470  0.25  100000    6470   95148  5933542  59.34
1933-1939      1-4      0.00470  0.01858  1.42   93530    1738  369630  5838395  62.42
1933-1939      5-9      0.00165  0.00821  2.33   91793     754  456951  5468765  59.58
1933-1939     10-14     0.00145  0.00723  2.62   91039     659  453622  5011813  55.05
1933-1939     15-19     0.00239  0.01190  2.69   90380    1076  449413  4558191  50.43
1933-1939     20-24     0.00330  0.01636  2.56   89304    1461  442964  4108778  46.01
1933-1939     25-29     0.00372  0.01844  2.54   87843    1620  435230  3665815  41.73
1933-1939     30-34     0.00444  0.02194  2.59   86223    1892  426560  3230584  37.47
1933-1939     35-39     0.00596  0.02938  2.60   84331    2477  415721  2804024  33.25
1933-1939     40-44     0.00787  0.03862  2.60   81854    3161  401692  2388303  29.18
1933-1939     45-49     0.01107  0.05391  2.62   78692    4242  383386  1986611  25.25
1933-1939     50-54     0.01582  0.07622  2.60   74450    5675  358629  1603226  21.53
1933-1939     55-59     0.02163  0.10280  2.59   68775    7070  326843  1244597  18.10
1933-1939     60-64     0.03134  0.14572  2.59   61705    8992  286861   917754  14.87
1933-1939     65-69     0.04588  0.20630  2.56   52714   10875  237046   630892  11.97
1933-1939     70-74     0.06813  0.29161  2.53   41839   12201  179086   393847   9.41
1933-1939     75-79     0.10227  0.40591  2.46   29638   12031  117632   214761   7.25
1933-1939     80-84     0.14960  0.53617  2.36   17608    9441   63106    97128   5.52
1933-1939     85-89     0.21888  0.67843  2.20    8167    5541   25314    34023   4.17
1933-1939     90-94     0.28259  0.76224  1.98    2626    2002    7084     8709   3.32
1933-1939     95-99     0.37235  0.85877  1.86     624     536    1440     1625   2.60
1933-1939    100-104    0.46932  0.91699  1.68      88      81     172      185   2.10
1933-1939    105-109    0.56895  0.95216  1.51       7       7      12       13   1.75
1933-1939    110+       0.64856  1.00000  1.54       0       0       1        1   1.54
1940-1949        0      0.04779  0.04599  0.18  100000    4599   96247  6310348  63.10
1940-1949      1-4      0.00225  0.00897  1.52   95401     855  379478  6214100  65.14
1940-1949      5-9      0.00103  0.00512  2.39   94545     484  471462  5834623  61.71

Year: Year or range of years (for both period & cohort data)

Age: Age group for n-year interval from exact age x to just before exact age x+n, where n=1, 4, 5, or ∞ (open age interval)

m(x): Central death rate between ages x and x+n

q(x): Probability of death between ages x and x+n

a(x): Average length of survival between ages x and x+n for persons dying in the interval

l(x): Number of survivors at exact age x, assuming l(0) = 100,000

d(x): Number of deaths between ages x and x+n

L(x): Number of person-years lived between ages x and x+n

T(x): Number of person-years remaining after exact age x

e(x): Life expectancy at exact age x (in years)

In [15]:
def read_life_table(filepath):
    """
    Read a Human Mortality Database (HMD) life table text file.

    The expected layout is: one title/metadata line, one blank line, a header
    row (``Year Age mx qx ax lx dx Lx Tx ex``) and then whitespace-aligned
    data rows.

    Parameters:
    filepath (str): Path to the whitespace-delimited life table file

    Returns:
    pandas.DataFrame: Life table with the measure columns renamed to
        'm(x)', 'q(x)', 'a(x)', 'l(x)', 'd(x)', 'L(x)', 'T(x)', 'e(x)' and
        converted to numbers. 'Year' and 'Age' are kept as parsed, so 'Age'
        holds strings whenever the file contains labels such as '1-4' or '110+'.
    str: Metadata from the first line

    Raises:
    KeyError: If one of the eight measure columns is missing from the file
    """
    # The first line is free-text metadata (title, last-modified date, protocol).
    with open(filepath, 'r') as file:
        metadata = file.readline().strip()

    # `sep=r'\s+'` replaces the deprecated `delim_whitespace=True`; both split
    # on runs of whitespace, which is how HMD aligns its columns.
    df = pd.read_csv(filepath,
                     sep=r'\s+',
                     skiprows=2,              # Skip metadata and empty line
                     na_values=['', '.'],     # HMD writes '.' for missing cells
                     engine='python')         # Regex separators need this engine

    # Clean column names
    df.columns = df.columns.str.strip()

    # Rename the raw HMD headers to the notation used in the documentation.
    column_mapping = {
        'mx': 'm(x)',         # Central death rate
        'qx': 'q(x)',         # Probability of death
        'ax': 'a(x)',         # Average survival length
        'lx': 'l(x)',         # Number of survivors
        'dx': 'd(x)',         # Number of deaths
        'Lx': 'L(x)',         # Person-years lived
        'Tx': 'T(x)',         # Person-years remaining
        'ex': 'e(x)'          # Life expectancy
    }
    df = df.rename(columns=column_mapping)

    # Force every measure column to be numeric; anything unparseable becomes NaN.
    numeric_columns = list(column_mapping.values())
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    return df, metadata

# Example usage:
# df, metadata = read_life_table('USA/STATS/bltper_5x10.txt')
# print("Metadata:", metadata)
# print("\nFirst few rows of the data:")
# print(df.head())

# Example of basic analysis:
def analyze_life_table(df):
    """
    Perform basic analysis on the life table data.

    Parameters:
    df (pandas.DataFrame): Processed life table data with 'Year', 'Age',
        'q(x)' and 'e(x)' columns

    Returns:
    dict: Basic statistics and insights with the keys
        'years_covered' (unique 'Year' values, in order of appearance),
        'age_groups' (number of distinct 'Age' values),
        'max_life_expectancy' / 'min_life_expectancy' (extremes of 'e(x)'
        over every row, i.e. over all ages and years),
        'infant_mortality' (q(x) of the first row whose age is '0'),
        'adult_mortality' (q(x) of the first row whose age is '15-19').
        The two mortality entries are NaN when the table has no such age
        row, e.g. '15-19' does not exist in single-year-of-age tables.
    """
    def first_qx(age_label):
        # Compare as text so that both the label '0' and an integer 0 match.
        matches = df.loc[df['Age'].astype(str) == age_label, 'q(x)']
        return matches.iloc[0] if not matches.empty else float('nan')

    analysis = {
        'years_covered': df['Year'].unique().tolist(),
        'age_groups': len(df['Age'].unique()),
        'max_life_expectancy': df['e(x)'].max(),
        'min_life_expectancy': df['e(x)'].min(),
        'infant_mortality': first_qx('0'),
        'adult_mortality': first_qx('15-19')
    }

    return analysis
In [16]:
# Load the total-population 1x1 period life table (single-year ages, single calendar years) and preview it.
d, meta = read_life_table('USA/STATS/bltper_1x1.txt')
d
Out[16]:
Year Age m(x) q(x) a(x) l(x) d(x) L(x) T(x) e(x)
0 1933 0 0.06129 0.05861 0.25 100000 5861 95624 6089609 60.90
1 1933 1 0.00946 0.00941 0.50 94139 886 93696 5993985 63.67
2 1933 2 0.00435 0.00434 0.50 93253 405 93050 5900289 63.27
3 1933 3 0.00310 0.00310 0.50 92848 288 92704 5807239 62.55
4 1933 4 0.00239 0.00238 0.50 92560 221 92450 5714535 61.74
... ... ... ... ... ... ... ... ... ... ...
9985 2022 106 0.58815 0.45449 0.50 95 43 74 154 1.62
9986 2022 107 0.61915 0.47279 0.50 52 25 40 81 1.55
9987 2022 108 0.64929 0.49016 0.50 27 13 21 41 1.49
9988 2022 109 0.67834 0.50654 0.50 14 7 10 20 1.45
9989 2022 110+ 0.70614 1.00000 1.42 7 7 10 10 1.42

9990 rows × 10 columns

In [17]:
# Load the total-population 5x1 period life table (age groups such as '1-4' and '5-9', single calendar years).
# `df` is the frame plotted by the trend charts that follow.
df, _ = read_life_table('USA/STATS/bltper_5x1.txt')
df
Out[17]:
Year Age m(x) q(x) a(x) l(x) d(x) L(x) T(x) e(x)
0 1933 0 0.06129 0.05861 0.25 100000 5861 95624 6089609 60.90
1 1933 1-4 0.00484 0.01911 1.41 94139 1799 371900 5993985 63.67
2 1933 5-9 0.00164 0.00815 2.31 92340 752 459672 5622085 60.88
3 1933 10-14 0.00135 0.00674 2.61 91587 617 456463 5162413 56.37
4 1933 15-19 0.00229 0.01137 2.70 90970 1034 452468 4705950 51.73
... ... ... ... ... ... ... ... ... ... ...
2155 2022 90-94 0.18691 0.62371 2.33 22650 14127 75583 102348 4.52
2156 2022 95-99 0.29637 0.79294 2.07 8523 6758 22803 26765 3.14
2157 2022 100-104 0.43427 0.90435 1.77 1765 1596 3675 3961 2.24
2158 2022 105-109 0.58547 0.95915 1.50 169 162 277 286 1.70
2159 2022 110+ 0.70614 1.00000 1.42 7 7 10 10 1.42

2160 rows × 10 columns

In [18]:
def extract_first_age(age_str):
    """Return the lower bound of an age label as an int, for sorting.

    Accepts single ages ('7'), closed ranges ('15-19' -> 15) and open-ended
    groups ('110+' -> 110). Non-string input such as an integer age is
    converted with str() first, so already-numeric ages are accepted too.
    """
    label = str(age_str)
    lower_bound = label.split('-')[0] if '-' in label else label.rstrip('+')
    return int(lower_bound)

# Distinct age labels ordered by their lower bound; passed to the charts as the legend/colour order.
age_order = list(df['Age'].unique())
age_order.sort(key=extract_first_age)
In [19]:
# Lower bound of the age group plus remaining life expectancy at that age.
df['x+e(x)'] = df['e(x)'] + df['Age'].map(extract_first_age)
In [20]:
# Line chart of remaining life expectancy e(x) per calendar year, one line per age group.
chart = alt.Chart(df).mark_line().encode(
    x=alt.X('Year:O', 
            title='Year',
            axis=alt.Axis(
                labelAngle=-45,
                values=list(range(1935, 2023, 5)),  # Label every 5th year: 1935, 1940, ..., 2020
                labelOverlap=False,
                labelPadding=10
            )),
    y=alt.Y('e(x):Q', 
            title='Remaining Life Expectancy (years)',
            scale=alt.Scale(zero=False)
),
    color=alt.Color('Age:N', 
                   title='Age Group',
                   sort=age_order,  # legend ordered by the lower bound of each age group
                   scale=alt.Scale(scheme='viridis'),
                   legend=alt.Legend(
                       orient='right',
                       title='Age Group',
                       symbolLimit=50)),
    tooltip=['Year', 'Age', 'e(x)']
).properties(
    width=400,
    height=300,
    title='Remaining Life Expectancy by Age Group Over Time'
)
chart
Out[20]:
In [21]:
# Write the remaining-life-expectancy chart to a PNG file, rendered with scale_factor=2.
chart.save('mortality-improvement-trend-remaining.png', scale_factor=2)
In [22]:
# Line chart of x + e(x) (age already reached plus remaining life expectancy)
# per calendar year, one line per age group.
chart = alt.Chart(df).mark_line().encode(
    x=alt.X('Year:O',
            title='Year',
            axis=alt.Axis(
                labelAngle=-45,
                values=list(range(1935, 2023, 5)),  # Label every 5th year: 1935, 1940, ..., 2020
                labelOverlap=False,
                labelPadding=10
            )),
    y=alt.Y('x+e(x):Q',
            title='Life Expectancy (years)',
            scale=alt.Scale(zero=False)),
    color=alt.Color('Age:N',
                    title='Age Group',
                    sort=age_order,  # legend ordered by the lower bound of each age group
                    scale=alt.Scale(scheme='viridis'),
                    legend=alt.Legend(
                        orient='right',
                        title='Age Group',
                        symbolLimit=50)),
    # Show the plotted quantity itself in the tooltip, next to the raw e(x).
    tooltip=['Year', 'Age', 'e(x)', 'x+e(x)']
).properties(
    width=400,
    height=300,
    title='Life Expectancy by Age Group Over Time'
)
chart
Out[22]:
In [23]:
# Write the x + e(x) chart to a PNG file, rendered with scale_factor=2.
chart.save('mortality-improvement-trend.png', scale_factor=2)
In [24]:
def clean_age(age_str):
    """Return the age as an int; the open-ended top group '110+' maps to 110."""
    if age_str == '110+':
        return 110
    return int(age_str)
In [25]:
def calculate_survival_distribution(df, current_age, current_year):
    """
    Calculate survival probabilities for all future ages given current age

    The q(x) values of a single calendar year are chained together, so the
    result describes a person who experiences that year's death probabilities
    at every future age.

    Parameters:
    df: DataFrame with life table data; needs 'Year', 'Age', 'q(x)' and
        'e(x)' columns with one row per single year of age. Ages may be ints
        or labels such as '35' and '110+'.
    current_age: int, current age of the person
    current_year: int, current year for mortality rates

    Returns:
    tuple (result, stats)
        result: DataFrame with one row per age from current_age upwards and
            the columns 'Age', 'Survival_Probability' (probability of being
            alive at that age, 1.0 at current_age) and 'Death_Probability'
            (numerical derivative of minus the survival curve w.r.t. age).
        stats: dict with 'mean_age' (current_age + e(x) at current_age),
            'current_age', and 'median_age' (first age at which the survival
            probability is <= 0.5, NaN if that never happens).

    Raises:
    ValueError: If df has no rows for current_year, if fewer than two ages at
        or above current_age exist (a derivative needs two points), or if an
        age label is not a single year of age (e.g. '1-4').
    IndexError: If no row has exactly the age current_age.
    """
    # Filter data for the specific year
    year_data = df[df['Year'] == current_year].copy()
    if year_data.empty:
        raise ValueError(f"No life table rows found for year {current_year!r}")

    # Normalise labels such as '110+' to integers in one vectorised pass.
    year_data['Age'] = pd.to_numeric(
        year_data['Age'].astype(str).str.rstrip('+')
    ).astype(int)

    # Keep current and future ages, in ascending order so the cumulative
    # product and the gradient below walk forward through life.
    future_ages = year_data[year_data['Age'] >= current_age].sort_values('Age', kind='stable')
    if len(future_ages) < 2:
        raise ValueError(
            f"Need at least two ages >= {current_age!r} in year {current_year!r} "
            f"to build a survival curve, found {len(future_ages)}"
        )

    ages = future_ages['Age'].tolist()
    death_probs = future_ages['q(x)'].to_numpy(dtype=float)

    # Survival to age x is the product of (1 - q) over all earlier ages:
    # 1.0 at the current age, then the running product shifted by one row.
    survival = np.concatenate(([1.0], np.cumprod(1.0 - death_probs[:-1])))

    # Create result DataFrame
    result = pd.DataFrame({
        'Age': ages,
        'Survival_Probability': survival,
        'Death_Probability': np.gradient(-survival, ages)  # Derivative gives death probability density
    })

    # e(x) at the current age is the expected number of remaining years.
    remaining_years = year_data.loc[year_data['Age'] == current_age, 'e(x)']
    if remaining_years.empty:
        raise IndexError(f"No life table row for age {current_age!r} in year {current_year!r}")

    at_most_half = result.loc[result['Survival_Probability'] <= 0.5, 'Age']

    # Calculate expected statistics
    stats = {
        'mean_age': current_age + remaining_years.iloc[0],
        'current_age': current_age,
        'median_age': at_most_half.iloc[0] if not at_most_half.empty else np.nan
    }

    return result, stats
In [26]:
def plot_survival_distribution(result_df, stats):
    """
    Create an Altair visualization of the survival distribution

    Parameters:
    result_df: DataFrame with 'Age', 'Survival_Probability' and
        'Death_Probability' columns
    stats: dict with 'current_age', 'mean_age' and 'median_age'

    Returns:
    Layered Altair chart: blue survival line, red death-density area (each on
    its own y axis), a green vertical rule at the mean age and an orange rule
    at the median age. The median rule is left out when 'median_age' is
    missing (NaN/None), because that value cannot be used as a chart position.
    """
    # Base chart properties
    base = alt.Chart(result_df).properties(
        width=800,
        height=400,
        title=f"Survival and Death Probability Distribution (Current Age: {stats['current_age']})"
    )

    # Survival probability line
    survival = base.mark_line(color='blue').encode(
        x=alt.X('Age:Q', title='Age'),
        y=alt.Y('Survival_Probability:Q',
                title='Probability',
                axis=alt.Axis(format='%')),
        tooltip=[
            alt.Tooltip('Age:Q', title='Age'),
            alt.Tooltip('Survival_Probability:Q', title='Survival Probability', format='.1%')
        ]
    )

    # Death probability density
    death = base.mark_area(
        color='red',
        opacity=0.3
    ).encode(
        x=alt.X('Age:Q'),
        y=alt.Y('Death_Probability:Q',
                title='Death Probability Density',
                axis=alt.Axis(format='%')),
        tooltip=[
            alt.Tooltip('Age:Q', title='Age'),
            alt.Tooltip('Death_Probability:Q', title='Death Probability Density', format='.2%')
        ]
    )

    # Vertical line at the mean age; the number is injected as a constant
    # calculated field so the rule shares the Age axis.
    mean_rule = base.mark_rule(color='green').encode(
        x=alt.X('mean_age:Q', title='Age'),
        size=alt.value(2)
    ).transform_calculate(
        mean_age=str(stats['mean_age'])
    )

    chart = survival + death + mean_rule

    # str(nan) would be injected as the expression "nan", so only draw the
    # median rule when a median actually exists.
    if not pd.isna(stats['median_age']):
        median_rule = base.mark_rule(color='orange').encode(
            x=alt.X('median_age:Q', title='Age'),
            size=alt.value(2)
        ).transform_calculate(
            median_age=str(stats['median_age'])
        )
        chart = chart + median_rule

    # Survival and density live on very different scales: give each its own y axis.
    chart = chart.resolve_scale(
        y='independent'
    ).configure_axis(
        grid=True
    )

    return chart
In [27]:
# Rebind `df` to the total-population 1x1 table (one row per single year of age) for the survival example below.
# NOTE(review): this replaces the 5x1 frame the trend charts above were built from, so re-running those cells afterwards uses different data.
df, _ = read_life_table('USA/STATS/bltper_1x1.txt')
df
Out[27]:
Year Age m(x) q(x) a(x) l(x) d(x) L(x) T(x) e(x)
0 1933 0 0.06129 0.05861 0.25 100000 5861 95624 6089609 60.90
1 1933 1 0.00946 0.00941 0.50 94139 886 93696 5993985 63.67
2 1933 2 0.00435 0.00434 0.50 93253 405 93050 5900289 63.27
3 1933 3 0.00310 0.00310 0.50 92848 288 92704 5807239 62.55
4 1933 4 0.00239 0.00238 0.50 92560 221 92450 5714535 61.74
... ... ... ... ... ... ... ... ... ... ...
9985 2022 106 0.58815 0.45449 0.50 95 43 74 154 1.62
9986 2022 107 0.61915 0.47279 0.50 52 25 40 81 1.55
9987 2022 108 0.64929 0.49016 0.50 27 13 21 41 1.49
9988 2022 109 0.67834 0.50654 0.50 14 7 10 20 1.45
9989 2022 110+ 0.70614 1.00000 1.42 7 7 10 10 1.42

9990 rows × 10 columns

In [28]:
# Worked example: survival curve for a 35-year-old using the 2022 rows of the life table.
current_age = 35
current_year = 2022
result, stats = calculate_survival_distribution(df, current_age, current_year)
# Display both the per-age table and the summary statistics.
result, stats
Out[28]:
(    Age  Survival_Probability  Death_Probability
 0    35              1.000000           0.002020
 1    36              0.997980           0.002098
 2    37              0.995804           0.002218
 3    38              0.993544           0.002283
 4    39              0.991239           0.002382
 ..  ...                   ...                ...
 71  106              0.000985           0.000603
 72  107              0.000537           0.000351
 73  108              0.000283           0.000196
 74  109              0.000144           0.000106
 75  110              0.000071           0.000073
 
 [76 rows x 3 columns],
 {'mean_age': 79.42, 'current_age': 35, 'median_age': 83})
In [29]:
# Chart the example: survival line, death density, and rules at the mean and median ages.
plot_survival_distribution(result, stats)
Out[29]:
In [30]:
def plot_survival_distribution_labels(result_df, stats, label=False):
    """Create an Altair visualization of the survival distribution with labels

    Parameters:
    result_df: DataFrame with 'Age', 'Survival_Probability' and
        'Death_Probability' columns
    stats: dict with 'current_age', 'mean_age' and 'median_age'
    label: bool, when True also draw the fixed-position captions
        'Survival Probability' (blue) and 'Death Probability Density' (red).
        Defaults to False.

    Returns:
    Layered Altair chart: survival line and death-density area on independent
    y axes, vertical rules at the mean (green) and median (orange) ages, and a
    text label next to each rule. The median rule and its label are left out
    when 'median_age' is missing (NaN/None).
    """
    # Base chart properties
    base = alt.Chart(result_df).properties(
        width=800,
        height=400,
        title=f"Survival and Death Probability Distribution (Current Age: {stats['current_age']})"
    )

    # Survival probability line
    survival = base.mark_line(color='blue').encode(
        x=alt.X('Age:Q', title='Age'),
        y=alt.Y('Survival_Probability:Q',
                title='Probability',
                axis=alt.Axis(format='%')),
        tooltip=[
            alt.Tooltip('Age:Q', title='Age'),
            alt.Tooltip('Survival_Probability:Q', title='Survival Probability', format='.1%')
        ]
    )

    # Death probability density
    death = base.mark_area(
        color='red',
        opacity=0.3,
    ).encode(
        x=alt.X('Age:Q'),
        y=alt.Y('Death_Probability:Q',
                title='Death Probability Density',
                axis=alt.Axis(format='%')),
        tooltip=[
            alt.Tooltip('Age:Q', title='Age'),
            alt.Tooltip('Death_Probability:Q', title='Death Probability Density', format='.2%')
        ]
    )

    # Vertical line at the mean age, plus its text label at the top edge.
    mean_rule = base.mark_rule(color='green').encode(
        x=alt.X('mean_age:Q', title='Age'),
        size=alt.value(2)
    ).transform_calculate(
        mean_age=str(stats['mean_age'])
    )

    mean_label = alt.Chart(pd.DataFrame([{
        'x': stats['mean_age'],
        'y': 1.0,
        'label': f"Mean Age: {stats['mean_age']:.1f}"
    }])).mark_text(
        align='left',
        baseline='bottom',
        dy=-5,
        color='green'
    ).encode(
        x='x:Q',
        y=alt.value(0),
        text='label:N'
    )

    layers = [survival, death, mean_rule]
    text_layers = [mean_label]

    # str(nan) would be injected as the expression "nan", so the median rule
    # and label are only built when a median actually exists.
    if not pd.isna(stats['median_age']):
        median_rule = base.mark_rule(color='orange').encode(
            x=alt.X('median_age:Q', title='Age'),
            size=alt.value(2)
        ).transform_calculate(
            median_age=str(stats['median_age'])
        )

        median_label = alt.Chart(pd.DataFrame([{
            'x': stats['median_age'],
            'y': 1,
            'label': f"Median Age: {stats['median_age']:.1f}"
        }])).mark_text(
            align='left',
            baseline='top',
            dy=5,
            dx=5,
            color='orange'
        ).encode(
            x='x:Q',
            y=alt.value(0),
            text='label:N'
        )

        layers.append(median_rule)
        text_layers.append(median_label)

    if label:
        # Captions at fixed pixel positions. Each uses a one-row frame so the
        # text is drawn once rather than once per row of result_df.
        survival_label = alt.Chart(pd.DataFrame([{
            'label': 'Survival Probability'
        }])).mark_text(
            align='left',
            baseline='middle',
            dx=5,
            dy=-10,
            color='blue'
        ).encode(
            x=alt.value(50),   # Position label near start of line
            y=alt.value(350),  # Adjust y position as needed
            text='label:N'
        )

        death_label = alt.Chart(pd.DataFrame([{
            'label': 'Death Probability Density'
        }])).mark_text(
            align='left',
            baseline='middle',
            dx=5,
            dy=-10,
            color='red'
        ).encode(
            x=alt.value(50),  # Position label near start of line
            y=alt.value(50),  # Adjust y position as needed
            text='label:N'
        )

        text_layers.extend([survival_label, death_label])

    # Combine all elements: data layers and rules first, text on top.
    chart = layers[0]
    for layer in layers[1:] + text_layers:
        chart = chart + layer

    chart = chart.resolve_scale(
        y='independent'
    ).configure_axis(
        grid=True
    )

    return chart
In [31]:
# Same example as above, this time with text labels next to the mean and median rules.
chart = plot_survival_distribution_labels(result, stats)
chart
Out[31]:
In [32]:
# Load the 1x1 period life tables for the total population, males and females.
combined, _ = read_life_table('USA/STATS/bltper_1x1.txt')
male, _ = read_life_table('USA/STATS/mltper_1x1.txt')
female, _ = read_life_table('USA/STATS/fltper_1x1.txt')
# Tag every row with the population it describes.
combined['Gender'], male['Gender'], female['Gender'] = 'Combined', 'Male', 'Female'
# Stack the three tables into one long frame.
# NOTE(review): the name `all` hides Python's builtin all() for the rest of the session;
# it is kept here because the export cell refers to it.
all = pd.concat([combined, male, female])
# Turn the age labels into integers ('110+' becomes 110).
all['Age'] = all['Age'].map(clean_age)
In [33]:
# Export the stacked life tables to all.json in the 'records' layout (one JSON object per table row).
all.to_json('all.json', orient='records')
In [ ]: