Julia self study

Self studying Julia

using DataFrames
using CSV
using XLSX

Make a toy dataframe

function grades_2020()
    name = ["Sally", "Bob", "Alice", "Hank"]
    grade_2020 = [1, 5, 8.5, 4]
    DataFrame(; name, grade_2020)
end
grades_2020()

function grades_2021()
    name = ["Sally", "Bob", "Alice", "Hank", "John"]
    grade_2021 = [3, 5, 8.5, 7, 10]
    DataFrame(; name, grade_2021)
end
grades_2021()
5×2 DataFrame
Row name grade_2021
String Float64
1 Sally 3.0
2 Bob 5.0
3 Alice 8.5
4 Hank 7.0
5 John 10.0

Loading and writing files

CSV files

Write the dataframe into CSV file

CSV.write("toy.csv", grades_2020())
"toy.csv"

Read in the CSV file

toy = CSV.read("toy.csv", DataFrame)
first(toy, 5)
4×2 DataFrame
Row name grade_2020
String7 Float64
1 Sally 1.0
2 Bob 5.0
3 Alice 8.5
4 Hank 4.0

Excel files

function write_xlsx(name, df::DataFrame)
    path = "$name.xlsx"
    data = collect(eachcol(df))
    cols = names(df)
    write(path, data, cols)
end

function write_grades_xlsx()
    path = "grades"
    write_xlsx(path, grades_2020())
    "$path.xlsx"
end

path = write_grades_xlsx()
xf = readxlsx(path)

DataFrames.jl

Filter and subset

equals_alice(name::String) = name == "Alice"

filter(:name => equals_alice, grades_2020())

filter(:name => n -> n == "Alice", grades_2020())

filter(:name => ==("Alice"), grades_2020())
filter(:name => !=("Alice"), grades_2020())
3×2 DataFrame
Row name grade_2020
String Float64
1 Sally 1.0
2 Bob 5.0
3 Hank 4.0
function complex_filter(name, grade)::Bool
    interesting_name = startswith(name, 'A') || startswith(name, 'B')
    interesting_grade = 6 < grade
    interesting_name && interesting_grade
end

filter([:name, :grade_2020] => complex_filter, grades_2020())
1×2 DataFrame
Row name grade_2020
String Float64
1 Alice 8.5

Select

function responses()
    id = [1, 2]
    q1 = [28, 61]
    q2 = [:us, :fr]
    q3 = ["F", "B"]
    q4 = ["B", "C"]
    q5 = ["A", "E"]
    DataFrame(; id, q1, q2, q3, q4, q5)
end
responses()
2×6 DataFrame
Row id q1 q2 q3 q4 q5
Int64 Int64 Symbol String String String
1 1 28 us F B A
2 2 61 fr B C E
select(responses(), :id, :q1)

# regex
select(responses(), r"^q")

select(responses(), Not(:q5))
select(responses(), Not([:q4, :q5]))
select(responses(), :q5, Not(:q5))

# renaming
select(responses(), 1 => "participant", :q1 => "age", :q2 => "nationality")
renames = (1 => "participant", :q1 => "age", :q2 => "nationality")
select(responses(), renames...)
2×3 DataFrame
Row participant age nationality
Int64 Int64 Symbol
1 1 28 us
2 2 61 fr

Left and right joins

leftjoin(grades_2020(), grades_2021(); on=:name)

rightjoin(grades_2020(), grades_2021(); on=:name)
5×3 DataFrame
Row name grade_2020 grade_2021
String Float64? Float64
1 Sally 1.0 3.0
2 Bob 5.0 5.0
3 Alice 8.5 8.5
4 Hank 4.0 7.0
5 John missing 10.0

Transform variables

plus_one(grades) = grades .+ 1
transform(grades_2020(), :grade_2020 => plus_one)

transform(grades_2020(), :grade_2020 => plus_one => :grade_2020)
transform(grades_2020(), :grade_2020 => plus_one; renamecols=false)
4×2 DataFrame
Row name grade_2020
String Float64
1 Sally 2.0
2 Bob 6.0
3 Alice 9.5
4 Hank 5.0
leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)

function only_pass()
    leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
    pass(A, B) = [5.5 < a || 5.5 < b for (a, b) in zip(A, B)]
    leftjoined = transform(leftjoined, [:grade_2020, :grade_2021] => pass => :pass)
    passed = subset(leftjoined, :pass; skipmissing=true)
    return passed.name
end
only_pass()
2-element Vector{String}:
 "Alice"
 "Hank"

Groupby and combine

function all_grades()
    df1 = grades_2020()
    df1 = select(df1, :name, :grade_2020 => :grade)
    df2 = grades_2021()
    df2 = select(df2, :name, :grade_2021 => :grade)
    rename_bob2(data_col) = replace.(data_col, "Bob 2" => "Bob")
    df2 = transform(df2, :name => rename_bob2 => :name)
    return vcat(df1, df2)
end
all_grades()
9×2 DataFrame
Row name grade
String Float64
1 Sally 1.0
2 Bob 5.0
3 Alice 8.5
4 Hank 4.0
5 Sally 3.0
6 Bob 5.0
7 Alice 8.5
8 Hank 7.0
9 John 10.0
groupby(all_grades(), :name)

using Statistics

gdf = groupby(all_grades(), :name)
combine(gdf, :grade => mean)
5×2 DataFrame
Row name grade_mean
String Float64
1 Sally 2.0
2 Bob 5.0
3 Alice 8.5
4 Hank 5.5
5 John 10.0
group = [:A, :A, :B, :B]
X = 1:4
Y = 5:8
df = DataFrame(; group, X, Y)

gdf = groupby(df, :group)
combine(gdf, [:X, :Y] .=> mean; renamecols=false)
2×3 DataFrame
Row group X Y
Symbol Float64 Float64
1 A 1.5 5.5
2 B 3.5 7.5

Missing values

df_missing = DataFrame(;
    name=[missing, "Sally", "Alice", "Hank"],
    age=[17, missing, 20, 19],
    grade_2020=[5.0, 1.0, missing, 4.0],
)

dropmissing(df_missing)

dropmissing(df_missing, :name)

dropmissing(df_missing, [:name, :age])
2×3 DataFrame
Row name age grade_2020
String Int64 Float64?
1 Alice 20 missing
2 Hank 19 4.0
filter(:name => !ismissing, df_missing)

combine(df_missing, :grade_2020 => mean  skipmissing )
1×1 DataFrame
Row grade_2020_mean_skipmissing
Float64
1 3.33333

DataFramesMeta.jl

Similar syntax to dplyr in R.

using DataFramesMeta

Select

df = all_grades()
9×2 DataFrame
Row name grade
String Float64
1 Sally 1.0
2 Bob 5.0
3 Alice 8.5
4 Hank 4.0
5 Sally 3.0
6 Bob 5.0
7 Alice 8.5
8 Hank 7.0
9 John 10.0
@select df :name
@select df :name :grade
@select df $(Not(:grade))
9×1 DataFrame
Row name
String
1 Sally
2 Bob
3 Alice
4 Hank
5 Sally
6 Bob
7 Alice
8 Hank
9 John
@select df :grade_100 = :grade .* 10
@rselect df :grade_100 = :grade * 10
9×1 DataFrame
Row grade_100
Float64
1 10.0
2 50.0
3 85.0
4 40.0
5 30.0
6 50.0
7 85.0
8 70.0
9 100.0

Column transformation

@rtransform df :grade_100 = :grade * 10
9×3 DataFrame
Row name grade grade_100
String Float64 Float64
1 Sally 1.0 10.0
2 Bob 5.0 50.0
3 Alice 8.5 85.0
4 Hank 4.0 40.0
5 Sally 3.0 30.0
6 Bob 5.0 50.0
7 Alice 8.5 85.0
8 Hank 7.0 70.0
9 John 10.0 100.0
@rtransform df :grade_100 = :grade * 10 :grade_5 = :grade / 2

@rtransform df begin
    :grade_100 = :grade * 10
    :grade_5 = :grade / 2
end
9×4 DataFrame
Row name grade grade_100 grade_5
String Float64 Float64 Float64
1 Sally 1.0 10.0 0.5
2 Bob 5.0 50.0 2.5
3 Alice 8.5 85.0 4.25
4 Hank 4.0 40.0 2.0
5 Sally 3.0 30.0 1.5
6 Bob 5.0 50.0 2.5
7 Alice 8.5 85.0 4.25
8 Hank 7.0 70.0 3.5
9 John 10.0 100.0 5.0
leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
@rtransform! leftjoined :grade_2021 = coalesce(:grade_2021, 5)

@rtransform leftjoined :mean_grades = (:grade_2020 + :grade_2021) / 2
4×4 DataFrame
Row name grade_2020 grade_2021 mean_grades
String Float64 Float64 Float64
1 Sally 1.0 3.0 2.0
2 Bob 5.0 5.0 5.0
3 Alice 8.5 8.5 8.5
4 Hank 4.0 7.0 5.5

Row selection

@rsubset df :grade > 7
3×2 DataFrame
Row name grade
String Float64
1 Alice 8.5
2 Alice 8.5
3 John 10.0
@subset df :grade .> mean(:grade)
4×2 DataFrame
Row name grade
String Float64
1 Alice 8.5
2 Alice 8.5
3 Hank 7.0
4 John 10.0
@rsubset df begin
    :grade > 7
    startswith(:name, "A")
end
2×2 DataFrame
Row name grade
String Float64
1 Alice 8.5
2 Alice 8.5

Row sorting

@orderby leftjoined :grade_2021
@orderby leftjoined -:grade_2021

@orderby leftjoined begin
    :grade_2021
    :name
end
4×3 DataFrame
Row name grade_2020 grade_2021
String Float64 Float64
1 Sally 1.0 3.0
2 Bob 5.0 5.0
3 Hank 4.0 7.0
4 Alice 8.5 8.5

Data summaries

@combine leftjoined :mean_grade_2020 = mean(:grade_2020)
1×1 DataFrame
Row mean_grade_2020
Float64
1 4.625
@combine leftjoined begin
    :mean_grade_2020 = mean(:grade_2020)
    :mean_grade_2021 = mean(:grade_2021)
end
1×2 DataFrame
Row mean_grade_2020 mean_grade_2021
Float64 Float64
1 4.625 5.875
gdf = groupby(leftjoined, :name)
@combine gdf begin
    :mean_grade_2020 = mean(:grade_2020)
    :mean_grade_2021 = mean(:grade_2021)
end
4×3 DataFrame
Row name mean_grade_2020 mean_grade_2021
String Float64 Float64
1 Sally 1.0 3.0
2 Bob 5.0 5.0
3 Alice 8.5 8.5
4 Hank 4.0 7.0

Piping operations

@chain leftjoined begin
    groupby(:name)
    @combine :mean_grade_2020 = mean(:grade_2020)
end
4×2 DataFrame
Row name mean_grade_2020
String Float64
1 Sally 1.0
2 Bob 5.0
3 Alice 8.5
4 Hank 4.0
@chain leftjoined begin
    groupby(:name)
    @combine begin
        :mean_grade_2020 = mean(:grade_2020)
        :mean_grade_2021 = mean(:grade_2021)
    end
end
4×3 DataFrame
Row name mean_grade_2020 mean_grade_2021
String Float64 Float64
1 Sally 1.0 3.0
2 Bob 5.0 5.0
3 Alice 8.5 8.5
4 Hank 4.0 7.0

Finale!

@chain leftjoined begin
    @rtransform begin
        :grade_2020 = :grade_2020 * 10
        :grade_2021 = :grade_2021 * 10
    end
    groupby(:name)
    @combine begin
        :mean_grade_2020 = mean(:grade_2020)
        :mean_grade_2021 = mean(:grade_2021)
    end
    @rtransform :mean_grades = (:mean_grade_2020 + :mean_grade_2021) / 2
    @rsubset :mean_grades > 50
    @orderby -:mean_grades
end
2×4 DataFrame
Row name mean_grade_2020 mean_grade_2021 mean_grades
String Float64 Float64 Float64
1 Alice 85.0 85.0 85.0
2 Hank 40.0 70.0 55.0

Quant-econ

Geometric series

# using Pkg
# Pkg.update()
using LinearAlgebra, Statistics
using Distributions, Plots, Random, LaTeXStrings, Symbolics

Example: The Keynesian multiplier

\(y_t = \frac{1-b^{t+1}}{1-b}i \Rightarrow \frac{1}{1-b}i\).

# function that calculates a path of y
function calculate_y(i, b, g, T, y_init)
    y = zeros(T+1)
    y[1] = i + b * y_init + g
    for t in 2:(T+1)
        y[t] = b * y[t-1] + i + g
    end
    return y
end

# Initial values
i_0 = 0.3
g_0 = 0.3
b = 2 / 3
y_init = 0
T = 100

plt = plot(0:T, calculate_y(i_0, b, g_0, T, y_init),
            title = "Path of Aggregate Output Over Time",
            ylim = (0.5, 1.9), xlabel = L"t", ylabel = L"y_t")

# Output predicted by geometric series
hline!([i_0 / (1-b) + g_0 / (1-b)], linestyle = :dash, seriestype = "hline", legend = false)

Linear algebra