using DataFrames
using CSV
using XLSX
Julia self study
Self studying Julia
Make a toy dataframe
function grades_2020()
= ["Sally", "Bob", "Alice", "Hank"]
name = [1, 5, 8.5, 4]
grade_2020 DataFrame(; name, grade_2020)
end
grades_2020()
function grades_2021()
= ["Sally", "Bob", "Alice", "Hank", "John"]
name = [3, 5, 8.5, 7, 10]
grade_2021 DataFrame(; name, grade_2021)
end
grades_2021()
5×2 DataFrame
Row | name | grade_2021 |
---|---|---|
String | Float64 | |
1 | Sally | 3.0 |
2 | Bob | 5.0 |
3 | Alice | 8.5 |
4 | Hank | 7.0 |
5 | John | 10.0 |
Loading and writing files
CSV files
Write the dataframe into CSV file
write("toy.csv", grades_2020()) CSV.
"toy.csv"
Read in the CSV file
= CSV.read("toy.csv", DataFrame)
toy first(toy, 5)
4×2 DataFrame
Row | name | grade_2020 |
---|---|---|
String7 | Float64 | |
1 | Sally | 1.0 |
2 | Bob | 5.0 |
3 | Alice | 8.5 |
4 | Hank | 4.0 |
Excel files
function write_xlsx(name, df::DataFrame)
= "$name.xlsx"
path = collect(eachcol(df))
data = names(df)
cols write(path, data, cols)
end
function write_grades_xlsx()
= "grades"
path write_xlsx(path, grades_2020())
"$path.xlsx"
end
= write_grades_xlsx()
path = readxlsx(path) xf
DataFrames.jl
Filter and subset
equals_alice(name::String) = name == "Alice"
filter(:name => equals_alice, grades_2020())
filter(:name => n -> n == "Alice", grades_2020())
filter(:name => ==("Alice"), grades_2020())
filter(:name => !=("Alice"), grades_2020())
3×2 DataFrame
Row | name | grade_2020 |
---|---|---|
String | Float64 | |
1 | Sally | 1.0 |
2 | Bob | 5.0 |
3 | Hank | 4.0 |
function complex_filter(name, grade)::Bool
= startswith(name, 'A') || startswith(name, 'B')
interesting_name = 6 < grade
interesting_grade && interesting_grade
interesting_name end
filter([:name, :grade_2020] => complex_filter, grades_2020())
1×2 DataFrame
Row | name | grade_2020 |
---|---|---|
String | Float64 | |
1 | Alice | 8.5 |
Select
function responses()
= [1, 2]
id = [28, 61]
q1 = [:us, :fr]
q2 = ["F", "B"]
q3 = ["B", "C"]
q4 = ["A", "E"]
q5 DataFrame(; id, q1, q2, q3, q4, q5)
end
responses()
2×6 DataFrame
Row | id | q1 | q2 | q3 | q4 | q5 |
---|---|---|---|---|---|---|
Int64 | Int64 | Symbol | String | String | String | |
1 | 1 | 28 | us | F | B | A |
2 | 2 | 61 | fr | B | C | E |
select(responses(), :id, :q1)
# regex
select(responses(), r"^q")
select(responses(), Not(:q5))
select(responses(), Not([:q4, :q5]))
select(responses(), :q5, Not(:q5))
# renaming
select(responses(), 1 => "participant", :q1 => "age", :q2 => "nationality")
= (1 => "participant", :q1 => "age", :q2 => "nationality")
renames select(responses(), renames...)
2×3 DataFrame
Row | participant | age | nationality |
---|---|---|---|
Int64 | Int64 | Symbol | |
1 | 1 | 28 | us |
2 | 2 | 61 | fr |
Left and right joins
leftjoin(grades_2020(), grades_2021(); on=:name)
rightjoin(grades_2020(), grades_2021(); on=:name)
5×3 DataFrame
Row | name | grade_2020 | grade_2021 |
---|---|---|---|
String | Float64? | Float64 | |
1 | Sally | 1.0 | 3.0 |
2 | Bob | 5.0 | 5.0 |
3 | Alice | 8.5 | 8.5 |
4 | Hank | 4.0 | 7.0 |
5 | John | missing | 10.0 |
Transform variables
plus_one(grades) = grades .+ 1
transform(grades_2020(), :grade_2020 => plus_one)
transform(grades_2020(), :grade_2020 => plus_one => :grade_2020)
transform(grades_2020(), :grade_2020 => plus_one; renamecols=false)
4×2 DataFrame
Row | name | grade_2020 |
---|---|---|
String | Float64 | |
1 | Sally | 2.0 |
2 | Bob | 6.0 |
3 | Alice | 9.5 |
4 | Hank | 5.0 |
= leftjoin(grades_2020(), grades_2021(); on=:name)
leftjoined
function only_pass()
= leftjoin(grades_2020(), grades_2021(); on=:name)
leftjoined pass(A, B) = [5.5 < a || 5.5 < b for (a, b) in zip(A, B)]
= transform(leftjoined, [:grade_2020, :grade_2021] => pass => :pass)
leftjoined = subset(leftjoined, :pass; skipmissing=true)
passed return passed.name
end
only_pass()
2-element Vector{String}:
"Alice"
"Hank"
Groupby and combine
function all_grades()
= grades_2020()
df1 = select(df1, :name, :grade_2020 => :grade)
df1 = grades_2021()
df2 = select(df2, :name, :grade_2021 => :grade)
df2 rename_bob2(data_col) = replace.(data_col, "Bob 2" => "Bob")
= transform(df2, :name => rename_bob2 => :name)
df2 return vcat(df1, df2)
end
all_grades()
9×2 DataFrame
Row | name | grade |
---|---|---|
String | Float64 | |
1 | Sally | 1.0 |
2 | Bob | 5.0 |
3 | Alice | 8.5 |
4 | Hank | 4.0 |
5 | Sally | 3.0 |
6 | Bob | 5.0 |
7 | Alice | 8.5 |
8 | Hank | 7.0 |
9 | John | 10.0 |
groupby(all_grades(), :name)
using Statistics
= groupby(all_grades(), :name)
gdf combine(gdf, :grade => mean)
5×2 DataFrame
Row | name | grade_mean |
---|---|---|
String | Float64 | |
1 | Sally | 2.0 |
2 | Bob | 5.0 |
3 | Alice | 8.5 |
4 | Hank | 5.5 |
5 | John | 10.0 |
= [:A, :A, :B, :B]
group = 1:4
X = 5:8
Y = DataFrame(; group, X, Y)
df
= groupby(df, :group)
gdf combine(gdf, [:X, :Y] .=> mean; renamecols=false)
2×3 DataFrame
Row | group | X | Y |
---|---|---|---|
Symbol | Float64 | Float64 | |
1 | A | 1.5 | 5.5 |
2 | B | 3.5 | 7.5 |
Missing values
= DataFrame(;
df_missing =[missing, "Sally", "Alice", "Hank"],
name=[17, missing, 20, 19],
age=[5.0, 1.0, missing, 4.0],
grade_2020
)
dropmissing(df_missing)
dropmissing(df_missing, :name)
dropmissing(df_missing, [:name, :age])
2×3 DataFrame
Row | name | age | grade_2020 |
---|---|---|---|
String | Int64 | Float64? | |
1 | Alice | 20 | missing |
2 | Hank | 19 | 4.0 |
filter(:name => !ismissing, df_missing)
combine(df_missing, :grade_2020 => mean ∘ skipmissing )
1×1 DataFrame
Row | grade_2020_mean_skipmissing |
---|---|
Float64 | |
1 | 3.33333 |
DataFramesMeta.jl
Similar syntax to dplyr
in R
.
using DataFramesMeta
Select
= all_grades() df
9×2 DataFrame
Row | name | grade |
---|---|---|
String | Float64 | |
1 | Sally | 1.0 |
2 | Bob | 5.0 |
3 | Alice | 8.5 |
4 | Hank | 4.0 |
5 | Sally | 3.0 |
6 | Bob | 5.0 |
7 | Alice | 8.5 |
8 | Hank | 7.0 |
9 | John | 10.0 |
@select df :name
@select df :name :grade
@select df $(Not(:grade))
9×1 DataFrame
Row | name |
---|---|
String | |
1 | Sally |
2 | Bob |
3 | Alice |
4 | Hank |
5 | Sally |
6 | Bob |
7 | Alice |
8 | Hank |
9 | John |
@select df :grade_100 = :grade .* 10
@rselect df :grade_100 = :grade * 10
9×1 DataFrame
Row | grade_100 |
---|---|
Float64 | |
1 | 10.0 |
2 | 50.0 |
3 | 85.0 |
4 | 40.0 |
5 | 30.0 |
6 | 50.0 |
7 | 85.0 |
8 | 70.0 |
9 | 100.0 |
Column transformation
@rtransform df :grade_100 = :grade * 10
9×3 DataFrame
Row | name | grade | grade_100 |
---|---|---|---|
String | Float64 | Float64 | |
1 | Sally | 1.0 | 10.0 |
2 | Bob | 5.0 | 50.0 |
3 | Alice | 8.5 | 85.0 |
4 | Hank | 4.0 | 40.0 |
5 | Sally | 3.0 | 30.0 |
6 | Bob | 5.0 | 50.0 |
7 | Alice | 8.5 | 85.0 |
8 | Hank | 7.0 | 70.0 |
9 | John | 10.0 | 100.0 |
@rtransform df :grade_100 = :grade * 10 :grade_5 = :grade / 2
@rtransform df begin
:grade_100 = :grade * 10
:grade_5 = :grade / 2
end
9×4 DataFrame
Row | name | grade | grade_100 | grade_5 |
---|---|---|---|---|
String | Float64 | Float64 | Float64 | |
1 | Sally | 1.0 | 10.0 | 0.5 |
2 | Bob | 5.0 | 50.0 | 2.5 |
3 | Alice | 8.5 | 85.0 | 4.25 |
4 | Hank | 4.0 | 40.0 | 2.0 |
5 | Sally | 3.0 | 30.0 | 1.5 |
6 | Bob | 5.0 | 50.0 | 2.5 |
7 | Alice | 8.5 | 85.0 | 4.25 |
8 | Hank | 7.0 | 70.0 | 3.5 |
9 | John | 10.0 | 100.0 | 5.0 |
= leftjoin(grades_2020(), grades_2021(); on=:name)
leftjoined @rtransform! leftjoined :grade_2021 = coalesce(:grade_2021, 5)
@rtransform leftjoined :mean_grades = (:grade_2020 + :grade_2021) / 2
4×4 DataFrame
Row | name | grade_2020 | grade_2021 | mean_grades |
---|---|---|---|---|
String | Float64 | Float64 | Float64 | |
1 | Sally | 1.0 | 3.0 | 2.0 |
2 | Bob | 5.0 | 5.0 | 5.0 |
3 | Alice | 8.5 | 8.5 | 8.5 |
4 | Hank | 4.0 | 7.0 | 5.5 |
Row selection
@rsubset df :grade > 7
3×2 DataFrame
Row | name | grade |
---|---|---|
String | Float64 | |
1 | Alice | 8.5 |
2 | Alice | 8.5 |
3 | John | 10.0 |
@subset df :grade .> mean(:grade)
4×2 DataFrame
Row | name | grade |
---|---|---|
String | Float64 | |
1 | Alice | 8.5 |
2 | Alice | 8.5 |
3 | Hank | 7.0 |
4 | John | 10.0 |
@rsubset df begin
:grade > 7
startswith(:name, "A")
end
2×2 DataFrame
Row | name | grade |
---|---|---|
String | Float64 | |
1 | Alice | 8.5 |
2 | Alice | 8.5 |
Row sorting
@orderby leftjoined :grade_2021
@orderby leftjoined -:grade_2021
@orderby leftjoined begin
:grade_2021
:name
end
4×3 DataFrame
Row | name | grade_2020 | grade_2021 |
---|---|---|---|
String | Float64 | Float64 | |
1 | Sally | 1.0 | 3.0 |
2 | Bob | 5.0 | 5.0 |
3 | Hank | 4.0 | 7.0 |
4 | Alice | 8.5 | 8.5 |
Data summaries
@combine leftjoined :mean_grade_2020 = mean(:grade_2020)
1×1 DataFrame
Row | mean_grade_2020 |
---|---|
Float64 | |
1 | 4.625 |
@combine leftjoined begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end
1×2 DataFrame
Row | mean_grade_2020 | mean_grade_2021 |
---|---|---|
Float64 | Float64 | |
1 | 4.625 | 5.875 |
= groupby(leftjoined, :name)
gdf @combine gdf begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end
4×3 DataFrame
Row | name | mean_grade_2020 | mean_grade_2021 |
---|---|---|---|
String | Float64 | Float64 | |
1 | Sally | 1.0 | 3.0 |
2 | Bob | 5.0 | 5.0 |
3 | Alice | 8.5 | 8.5 |
4 | Hank | 4.0 | 7.0 |
Piping operations
@chain leftjoined begin
groupby(:name)
@combine :mean_grade_2020 = mean(:grade_2020)
end
4×2 DataFrame
Row | name | mean_grade_2020 |
---|---|---|
String | Float64 | |
1 | Sally | 1.0 |
2 | Bob | 5.0 |
3 | Alice | 8.5 |
4 | Hank | 4.0 |
@chain leftjoined begin
groupby(:name)
@combine begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end
end
4×3 DataFrame
Row | name | mean_grade_2020 | mean_grade_2021 |
---|---|---|---|
String | Float64 | Float64 | |
1 | Sally | 1.0 | 3.0 |
2 | Bob | 5.0 | 5.0 |
3 | Alice | 8.5 | 8.5 |
4 | Hank | 4.0 | 7.0 |
Finale!
@chain leftjoined begin
@rtransform begin
:grade_2020 = :grade_2020 * 10
:grade_2021 = :grade_2021 * 10
end
groupby(:name)
@combine begin
:mean_grade_2020 = mean(:grade_2020)
:mean_grade_2021 = mean(:grade_2021)
end
@rtransform :mean_grades = (:mean_grade_2020 + :mean_grade_2021) / 2
@rsubset :mean_grades > 50
@orderby -:mean_grades
end
2×4 DataFrame
Row | name | mean_grade_2020 | mean_grade_2021 | mean_grades |
---|---|---|---|---|
String | Float64 | Float64 | Float64 | |
1 | Alice | 85.0 | 85.0 | 85.0 |
2 | Hank | 40.0 | 70.0 | 55.0 |
Quant-econ
Geometric series
# using Pkg
# Pkg.update()
using LinearAlgebra, Statistics
using Distributions, Plots, Random, LaTeXStrings, Symbolics
Example: The Keynesian multiplier
\(y_t = \frac{1-b^{t+1}}{1-b}i \Rightarrow \frac{1}{1-b}i\).
# function that calculates a path of y
function calculate_y(i, b, g, T, y_init)
= zeros(T+1)
y 1] = i + b * y_init + g
y[for t in 2:(T+1)
= b * y[t-1] + i + g
y[t] end
return y
end
# Initial values
= 0.3
i_0 = 0.3
g_0 = 2 / 3
b = 0
y_init = 100
T
= plot(0:T, calculate_y(i_0, b, g_0, T, y_init),
plt = "Path of Aggregate Output Over Time",
title = (0.5, 1.9), xlabel = L"t", ylabel = L"y_t")
ylim
# Output predicted by geometric series
hline!([i_0 / (1-b) + g_0 / (1-b)], linestyle = :dash, seriestype = "hline", legend = false)