using DataFrames
using CSV
using XLSXJulia self study
Self studying Julia
Make a toy dataframe
function grades_2020()
name = ["Sally", "Bob", "Alice", "Hank"]
grade_2020 = [1, 5, 8.5, 4]
DataFrame(; name, grade_2020)
end
grades_2020()
function grades_2021()
name = ["Sally", "Bob", "Alice", "Hank", "John"]
grade_2021 = [3, 5, 8.5, 7, 10]
DataFrame(; name, grade_2021)
end
grades_2021()5×2 DataFrame
| Row | name | grade_2021 |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 3.0 |
| 2 | Bob | 5.0 |
| 3 | Alice | 8.5 |
| 4 | Hank | 7.0 |
| 5 | John | 10.0 |
Loading and writing files
CSV files
Write the dataframe into CSV file
CSV.write("toy.csv", grades_2020())"toy.csv"
Read in the CSV file
toy = CSV.read("toy.csv", DataFrame)
first(toy, 5)4×2 DataFrame
| Row | name | grade_2020 |
|---|---|---|
| String7 | Float64 | |
| 1 | Sally | 1.0 |
| 2 | Bob | 5.0 |
| 3 | Alice | 8.5 |
| 4 | Hank | 4.0 |
Excel files
function write_xlsx(name, df::DataFrame)
path = "$name.xlsx"
data = collect(eachcol(df))
cols = names(df)
write(path, data, cols)
end
function write_grades_xlsx()
path = "grades"
write_xlsx(path, grades_2020())
"$path.xlsx"
end
path = write_grades_xlsx()
xf = readxlsx(path)DataFrames.jl
Filter and subset
equals_alice(name::String) = name == "Alice"
filter(:name => equals_alice, grades_2020())
filter(:name => n -> n == "Alice", grades_2020())
filter(:name => ==("Alice"), grades_2020())
filter(:name => !=("Alice"), grades_2020())3×2 DataFrame
| Row | name | grade_2020 |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 1.0 |
| 2 | Bob | 5.0 |
| 3 | Hank | 4.0 |
function complex_filter(name, grade)::Bool
interesting_name = startswith(name, 'A') || startswith(name, 'B')
interesting_grade = 6 < grade
interesting_name && interesting_grade
end
filter([:name, :grade_2020] => complex_filter, grades_2020())1×2 DataFrame
| Row | name | grade_2020 |
|---|---|---|
| String | Float64 | |
| 1 | Alice | 8.5 |
Select
function responses()
id = [1, 2]
q1 = [28, 61]
q2 = [:us, :fr]
q3 = ["F", "B"]
q4 = ["B", "C"]
q5 = ["A", "E"]
DataFrame(; id, q1, q2, q3, q4, q5)
end
responses()2×6 DataFrame
| Row | id | q1 | q2 | q3 | q4 | q5 |
|---|---|---|---|---|---|---|
| Int64 | Int64 | Symbol | String | String | String | |
| 1 | 1 | 28 | us | F | B | A |
| 2 | 2 | 61 | fr | B | C | E |
select(responses(), :id, :q1)
# regex
select(responses(), r"^q")
select(responses(), Not(:q5))
select(responses(), Not([:q4, :q5]))
select(responses(), :q5, Not(:q5))
# renaming
select(responses(), 1 => "participant", :q1 => "age", :q2 => "nationality")
renames = (1 => "participant", :q1 => "age", :q2 => "nationality")
select(responses(), renames...)2×3 DataFrame
| Row | participant | age | nationality |
|---|---|---|---|
| Int64 | Int64 | Symbol | |
| 1 | 1 | 28 | us |
| 2 | 2 | 61 | fr |
Left and right joins
leftjoin(grades_2020(), grades_2021(); on=:name)
rightjoin(grades_2020(), grades_2021(); on=:name)5×3 DataFrame
| Row | name | grade_2020 | grade_2021 |
|---|---|---|---|
| String | Float64? | Float64 | |
| 1 | Sally | 1.0 | 3.0 |
| 2 | Bob | 5.0 | 5.0 |
| 3 | Alice | 8.5 | 8.5 |
| 4 | Hank | 4.0 | 7.0 |
| 5 | John | missing | 10.0 |
Transform variables
plus_one(grades) = grades .+ 1
transform(grades_2020(), :grade_2020 => plus_one)
transform(grades_2020(), :grade_2020 => plus_one => :grade_2020)
transform(grades_2020(), :grade_2020 => plus_one; renamecols=false)4×2 DataFrame
| Row | name | grade_2020 |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 2.0 |
| 2 | Bob | 6.0 |
| 3 | Alice | 9.5 |
| 4 | Hank | 5.0 |
leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
function only_pass()
leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
pass(A, B) = [5.5 < a || 5.5 < b for (a, b) in zip(A, B)]
leftjoined = transform(leftjoined, [:grade_2020, :grade_2021] => pass => :pass)
passed = subset(leftjoined, :pass; skipmissing=true)
return passed.name
end
only_pass()2-element Vector{String}:
"Alice"
"Hank"
Groupby and combine
function all_grades()
df1 = grades_2020()
df1 = select(df1, :name, :grade_2020 => :grade)
df2 = grades_2021()
df2 = select(df2, :name, :grade_2021 => :grade)
rename_bob2(data_col) = replace.(data_col, "Bob 2" => "Bob")
df2 = transform(df2, :name => rename_bob2 => :name)
return vcat(df1, df2)
end
all_grades()9×2 DataFrame
| Row | name | grade |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 1.0 |
| 2 | Bob | 5.0 |
| 3 | Alice | 8.5 |
| 4 | Hank | 4.0 |
| 5 | Sally | 3.0 |
| 6 | Bob | 5.0 |
| 7 | Alice | 8.5 |
| 8 | Hank | 7.0 |
| 9 | John | 10.0 |
groupby(all_grades(), :name)
using Statistics
gdf = groupby(all_grades(), :name)
combine(gdf, :grade => mean)5×2 DataFrame
| Row | name | grade_mean |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 2.0 |
| 2 | Bob | 5.0 |
| 3 | Alice | 8.5 |
| 4 | Hank | 5.5 |
| 5 | John | 10.0 |
group = [:A, :A, :B, :B]
X = 1:4
Y = 5:8
df = DataFrame(; group, X, Y)
gdf = groupby(df, :group)
combine(gdf, [:X, :Y] .=> mean; renamecols=false)2×3 DataFrame
| Row | group | X | Y |
|---|---|---|---|
| Symbol | Float64 | Float64 | |
| 1 | A | 1.5 | 5.5 |
| 2 | B | 3.5 | 7.5 |
Missing values
df_missing = DataFrame(;
name=[missing, "Sally", "Alice", "Hank"],
age=[17, missing, 20, 19],
grade_2020=[5.0, 1.0, missing, 4.0],
)
dropmissing(df_missing)
dropmissing(df_missing, :name)
dropmissing(df_missing, [:name, :age])2×3 DataFrame
| Row | name | age | grade_2020 |
|---|---|---|---|
| String | Int64 | Float64? | |
| 1 | Alice | 20 | missing |
| 2 | Hank | 19 | 4.0 |
filter(:name => !ismissing, df_missing)
combine(df_missing, :grade_2020 => mean ∘ skipmissing )1×1 DataFrame
| Row | grade_2020_mean_skipmissing |
|---|---|
| Float64 | |
| 1 | 3.33333 |
DataFramesMeta.jl
Similar syntax to dplyr in R.
using DataFramesMetaSelect
df = all_grades()9×2 DataFrame
| Row | name | grade |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 1.0 |
| 2 | Bob | 5.0 |
| 3 | Alice | 8.5 |
| 4 | Hank | 4.0 |
| 5 | Sally | 3.0 |
| 6 | Bob | 5.0 |
| 7 | Alice | 8.5 |
| 8 | Hank | 7.0 |
| 9 | John | 10.0 |
@select df :name
@select df :name :grade
@select df $(Not(:grade))9×1 DataFrame
| Row | name |
|---|---|
| String | |
| 1 | Sally |
| 2 | Bob |
| 3 | Alice |
| 4 | Hank |
| 5 | Sally |
| 6 | Bob |
| 7 | Alice |
| 8 | Hank |
| 9 | John |
@select df :grade_100 = :grade .* 10
@rselect df :grade_100 = :grade * 109×1 DataFrame
| Row | grade_100 |
|---|---|
| Float64 | |
| 1 | 10.0 |
| 2 | 50.0 |
| 3 | 85.0 |
| 4 | 40.0 |
| 5 | 30.0 |
| 6 | 50.0 |
| 7 | 85.0 |
| 8 | 70.0 |
| 9 | 100.0 |
Column transformation
@rtransform df :grade_100 = :grade * 109×3 DataFrame
| Row | name | grade | grade_100 |
|---|---|---|---|
| String | Float64 | Float64 | |
| 1 | Sally | 1.0 | 10.0 |
| 2 | Bob | 5.0 | 50.0 |
| 3 | Alice | 8.5 | 85.0 |
| 4 | Hank | 4.0 | 40.0 |
| 5 | Sally | 3.0 | 30.0 |
| 6 | Bob | 5.0 | 50.0 |
| 7 | Alice | 8.5 | 85.0 |
| 8 | Hank | 7.0 | 70.0 |
| 9 | John | 10.0 | 100.0 |
@rtransform df :grade_100 = :grade * 10 :grade_5 = :grade / 2
@rtransform df begin
:grade_100 = :grade * 10
:grade_5 = :grade / 2
end9×4 DataFrame
| Row | name | grade | grade_100 | grade_5 |
|---|---|---|---|---|
| String | Float64 | Float64 | Float64 | |
| 1 | Sally | 1.0 | 10.0 | 0.5 |
| 2 | Bob | 5.0 | 50.0 | 2.5 |
| 3 | Alice | 8.5 | 85.0 | 4.25 |
| 4 | Hank | 4.0 | 40.0 | 2.0 |
| 5 | Sally | 3.0 | 30.0 | 1.5 |
| 6 | Bob | 5.0 | 50.0 | 2.5 |
| 7 | Alice | 8.5 | 85.0 | 4.25 |
| 8 | Hank | 7.0 | 70.0 | 3.5 |
| 9 | John | 10.0 | 100.0 | 5.0 |
leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
@rtransform! leftjoined :grade_2021 = coalesce(:grade_2021, 5)
@rtransform leftjoined :mean_grades = (:grade_2020 + :grade_2021) / 24×4 DataFrame
| Row | name | grade_2020 | grade_2021 | mean_grades |
|---|---|---|---|---|
| String | Float64 | Float64 | Float64 | |
| 1 | Sally | 1.0 | 3.0 | 2.0 |
| 2 | Bob | 5.0 | 5.0 | 5.0 |
| 3 | Alice | 8.5 | 8.5 | 8.5 |
| 4 | Hank | 4.0 | 7.0 | 5.5 |
Row selection
@rsubset df :grade > 73×2 DataFrame
| Row | name | grade |
|---|---|---|
| String | Float64 | |
| 1 | Alice | 8.5 |
| 2 | Alice | 8.5 |
| 3 | John | 10.0 |
@subset df :grade .> mean(:grade)4×2 DataFrame
| Row | name | grade |
|---|---|---|
| String | Float64 | |
| 1 | Alice | 8.5 |
| 2 | Alice | 8.5 |
| 3 | Hank | 7.0 |
| 4 | John | 10.0 |
@rsubset df begin
:grade > 7
startswith(:name, "A")
end2×2 DataFrame
| Row | name | grade |
|---|---|---|
| String | Float64 | |
| 1 | Alice | 8.5 |
| 2 | Alice | 8.5 |
Row sorting
@orderby leftjoined :grade_2021
@orderby leftjoined -:grade_2021
@orderby leftjoined begin
:grade_2021
:name
end4×3 DataFrame
| Row | name | grade_2020 | grade_2021 |
|---|---|---|---|
| String | Float64 | Float64 | |
| 1 | Sally | 1.0 | 3.0 |
| 2 | Bob | 5.0 | 5.0 |
| 3 | Hank | 4.0 | 7.0 |
| 4 | Alice | 8.5 | 8.5 |
Data summaries
@combine leftjoined :mean_grade_2020 = mean(:grade_2020)1×1 DataFrame
| Row | mean_grade_2020 |
|---|---|
| Float64 | |
| 1 | 4.625 |
@combine leftjoined begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end1×2 DataFrame
| Row | mean_grade_2020 | mean_grade_2021 |
|---|---|---|
| Float64 | Float64 | |
| 1 | 4.625 | 5.875 |
gdf = groupby(leftjoined, :name)
@combine gdf begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end4×3 DataFrame
| Row | name | mean_grade_2020 | mean_grade_2021 |
|---|---|---|---|
| String | Float64 | Float64 | |
| 1 | Sally | 1.0 | 3.0 |
| 2 | Bob | 5.0 | 5.0 |
| 3 | Alice | 8.5 | 8.5 |
| 4 | Hank | 4.0 | 7.0 |
Piping operations
@chain leftjoined begin
groupby(:name)
@combine :mean_grade_2020 = mean(:grade_2020)
end4×2 DataFrame
| Row | name | mean_grade_2020 |
|---|---|---|
| String | Float64 | |
| 1 | Sally | 1.0 |
| 2 | Bob | 5.0 |
| 3 | Alice | 8.5 |
| 4 | Hank | 4.0 |
@chain leftjoined begin
groupby(:name)
@combine begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end
end4×3 DataFrame
| Row | name | mean_grade_2020 | mean_grade_2021 |
|---|---|---|---|
| String | Float64 | Float64 | |
| 1 | Sally | 1.0 | 3.0 |
| 2 | Bob | 5.0 | 5.0 |
| 3 | Alice | 8.5 | 8.5 |
| 4 | Hank | 4.0 | 7.0 |
Finale!
@chain leftjoined begin
@rtransform begin
:grade_2020 = :grade_2020 * 10
:grade_2021 = :grade_2021 * 10
end
groupby(:name)
@combine begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end
@rtransform :mean_grades = (:mean_grade_2020 + :mean_grade_2021) / 2
@rsubset :mean_grades > 50
@orderby -:mean_grades
end2×4 DataFrame
| Row | name | mean_grade_2020 | mean_grade_2021 | mean_grades |
|---|---|---|---|---|
| String | Float64 | Float64 | Float64 | |
| 1 | Alice | 85.0 | 85.0 | 85.0 |
| 2 | Hank | 40.0 | 70.0 | 55.0 |
Quant-econ
Geometric series
# using Pkg
# Pkg.update()
using LinearAlgebra, Statistics
using Distributions, Plots, Random, LaTeXStrings, SymbolicsExample: The Keynesian multiplier
\(y_t = \frac{1-b^{t+1}}{1-b}i \Rightarrow \frac{1}{1-b}i\).
# function that calculates a path of y
function calculate_y(i, b, g, T, y_init)
y = zeros(T+1)
y[1] = i + b * y_init + g
for t in 2:(T+1)
y[t] = b * y[t-1] + i + g
end
return y
end
# Initial values
i_0 = 0.3
g_0 = 0.3
b = 2 / 3
y_init = 0
T = 100
plt = plot(0:T, calculate_y(i_0, b, g_0, T, y_init),
title = "Path of Aggregate Output Over Time",
ylim = (0.5, 1.9), xlabel = L"t", ylabel = L"y_t")
# Output predicted by geometric series
hline!([i_0 / (1-b) + g_0 / (1-b)], linestyle = :dash, seriestype = "hline", legend = false)