In [1]:
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

In [2]:
# Load the Tallo tree records from the CSV file in the working directory.
# Column types are left for readr to guess.
data <- read_csv(file = "Tallo.csv")
Rows: 498838 Columns: 13
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): tree_id, division, family, genus, species, height_outlier, crown_ra...
dbl (6): latitude, longitude, stem_diameter_cm, height_m, crown_radius_m, re...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
In [3]:
# Build a name list: the first two names and the last one keep their
# positions, the four names in between are put in random order by sample().
c(
  "Simon",
  "Niklas",
  sample(c("Mike", "Robin", "Max", "Markus")),
  "Andreas"
)
  1. 'Simon'
  2. 'Niklas'
  3. 'Max'
  4. 'Robin'
  5. 'Mike'
  6. 'Markus'
  7. 'Andreas'
In [4]:
# Tally the records per species and list the most frequent species first.
data %>%
  count(species) %>%
  arrange(desc(n))
A spec_tbl_df: 5163 × 2
species n
<chr> <int>
NA 61230
Quercus ilex 22760
Pinus sylvestris 18221
Pinus halepensis 14003
Pinus pinaster 11766
Quercus infectoria 10964
Fagus sylvatica 9470
Pinus nigra 7365
Picea abies 6723
Acer saccharum 5744
Pseudotsuga menziesii 5158
Quercus libani 5119
Quercus faginea 4246
Quercus pubescens 4185
Quercus pyrenaica 4129
Acer rubrum 3545
Quercus suber 3256
Pinus pinea 3165
Pinus wallichiana 2969
Abies balsamea 2947
Picea mariana 2647
Quercus brantii 2619
Picea smithiana 2576
Callitris columellaris 2539
Eucalyptus globulus 2469
Quercus robur 2448
Carpinus betulus 2298
Gymnanthes lucida 2260
Litsea leefeana 1896
Pinus radiata 1878
⋮⋮
Viburnum odoratissimum 1
Vismia latifolia 1
Vitex axillariflora 1
Vitex ciliata 1
Vitex doniana 1
Vitex queenslandica 1
Voacanga thouarsii 1
Vochysia guatemalensis 1
Wendlandia guangdongensis 1
Wikstroemia chui 1
Wikstroemia indica 1
Wikstroemia nutans 1
Wisteria sinensis 1
Xanthophyllum amoenum 1
Xanthophyllum ellipticum 1
Xanthophyllum heterophyllum 1
Xanthophyllum macrophyllum 1
Xylia xylocarpa 1
Xylopia amazonica 1
Xylopia rubescens 1
Xylopia staudtii 1
Xylopia villosa 1
Zanha africana 1
Zanha golungensis 1
Zanthoxylum acuminatum 1
Zanthoxylum brachyacanthum 1
Zanthoxylum ovalifolium 1
Zanthoxylum riedelianum 1
Ziziphus calophylla 1
Ziziphus reticulata 1
In [5]:
# Keep only records with a known species, restricted to species that have
# at least 5000 records and that do not belong to the family Sapindaceae.
# The per-species record count added by add_count() stays in the result as
# column `n`.
filtered_data <- data %>%
  filter(!is.na(species)) %>%
  add_count(species) %>%
  filter(n >= 5000, family != "Sapindaceae")
In [6]:
# Scatter plot of record locations (longitude vs. latitude), coloured by
# family.
ggplot(filtered_data, aes(x = longitude, y = latitude, color = family)) +
  geom_point()
In [7]:
# Set the width and resolution used when plots are rendered in the notebook.
options(
  repr.plot.width = 12,
  repr.plot.res = 150
)
In [8]:
# Make theme_light() the active ggplot2 theme for the plots drawn afterwards.
theme_set(new = theme_light())
In [9]:
# Scatter plot of record locations (longitude vs. latitude), coloured by
# species.
ggplot(filtered_data, aes(x = longitude, y = latitude, color = species)) +
  geom_point()
In [10]:
# Height against stem diameter for every retained record, coloured by
# species. Rows with a missing value in either variable are dropped by
# geom_point() with a warning.
ggplot(
  filtered_data,
  aes(x = stem_diameter_cm, y = height_m, color = species)
) +
  geom_point()
Warning message:
“Removed 826 rows containing missing values (geom_point).”
In [11]:
# Same height-vs-diameter scatter, limited to three species
# (Fagus sylvatica, Quercus ilex, Picea abies).
filtered_data %>%
  filter(species %in% c("Fagus sylvatica", "Quercus ilex", "Picea abies")) %>%
  ggplot(aes(x = stem_diameter_cm, y = height_m, color = species)) +
  geom_point()
Warning message:
“Removed 567 rows containing missing values (geom_point).”
In [12]:
# Subset of filtered_data holding only the three species used by the
# cells that follow: Fagus sylvatica, Quercus ilex and Pinus pinaster.
filtered_species <- filter(
  filtered_data,
  species %in% c("Fagus sylvatica", "Quercus ilex", "Pinus pinaster")
)
In [13]:
# Record locations (longitude vs. latitude) of the three selected species.
ggplot(filtered_species, aes(x = longitude, y = latitude, color = species)) +
  geom_point()
In [14]:
# Height against stem diameter for the three selected species.
ggplot(
  filtered_species,
  aes(x = stem_diameter_cm, y = height_m, color = species)
) +
  geom_point()
Warning message:
“Removed 534 rows containing missing values (geom_point).”
In [15]:
# Height against stem diameter, coloured by the crown_radius_outlier column.
ggplot(
  filtered_species,
  aes(x = stem_diameter_cm, y = height_m, color = crown_radius_outlier)
) +
  geom_point()
Warning message:
“Removed 534 rows containing missing values (geom_point).”
In [16]:
# Height against crown radius for the three selected species.
ggplot(
  filtered_species,
  aes(x = crown_radius_m, y = height_m, color = species)
) +
  geom_point()
Warning message:
“Removed 4722 rows containing missing values (geom_point).”
In [17]:
# Height against stem diameter, coloured by the crown_radius_outlier column.
# NOTE(review): this is the same plot as an earlier cell in this notebook;
# confirm whether a different x variable (e.g. crown_radius_m) was intended.
ggplot(
  filtered_species,
  aes(x = stem_diameter_cm, y = height_m, color = crown_radius_outlier)
) +
  geom_point()
Warning message:
“Removed 534 rows containing missing values (geom_point).”
In [18]:
# Per-species summary of stem_diameter_cm: mean, standard deviation and
# variance, one row per species.
#
# The summary columns are named explicitly so they can be referenced without
# backticks, and missing values are ignored (na.rm = TRUE) in the same way as
# in the height summary of this notebook. The result is stored with the
# conventional left-hand assignment.
hans <- filtered_species %>%
  group_by(species) %>%
  summarize(
    mean_stem_diameter_cm = mean(stem_diameter_cm, na.rm = TRUE),
    sd_stem_diameter_cm = sd(stem_diameter_cm, na.rm = TRUE),
    var_stem_diameter_cm = var(stem_diameter_cm, na.rm = TRUE)
  )
In [19]:
# Density of stem diameter, one curve per species.
ggplot(filtered_species, aes(x = stem_diameter_cm, color = species)) +
  geom_density()
In [20]:
# Per-species summary of height_m: mean, standard deviation and variance,
# one row per species. Missing heights are ignored (na.rm = TRUE).
#
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned. The summary columns are named explicitly instead of relying on
# the auto-generated names derived from the expression text, and the result
# is stored with the conventional left-hand assignment.
dieter <- filtered_species %>%
  group_by(species) %>%
  summarize(
    mean_height_m = mean(height_m, na.rm = TRUE),
    sd_height_m = sd(height_m, na.rm = TRUE),
    var_height_m = var(height_m, na.rm = TRUE)
  )
In [21]:
# Display the per-species height summary stored in `dieter`.
dieter
A tibble: 3 × 4
species mean(height_m, na.rm = T) sd(height_m, na.rm = T) var(height_m, na.rm = T)
<chr> <dbl> <dbl> <dbl>
Fagus sylvatica 19.675126 8.038170 64.612176
Pinus pinaster 10.939198 4.668945 21.799052
Quercus ilex 6.327681 2.469029 6.096105
In [22]:
# Density of tree height, one curve per species. Rows without a height are
# dropped by stat_density() with a warning.
ggplot(filtered_species, aes(x = height_m, color = species)) +
  geom_density()
Warning message:
“Removed 534 rows containing non-finite values (stat_density).”
In [23]:
# Density of crown radius, one curve per species. Rows without a crown
# radius are dropped by stat_density() with a warning.
ggplot(filtered_species, aes(x = crown_radius_m, color = species)) +
  geom_density()
Warning message:
“Removed 4188 rows containing non-finite values (stat_density).”
In [24]:
# Per-species density curves for every column from latitude through
# crown_radius_m, one facet per column.
#
# The selected columns are stacked into long format (column name in
# `variable`, measurement in `value`) so that a single ggplot call can facet
# over them. Each facet gets its own x and y scale because the columns are
# measured on different scales.
filtered_species %>%
  pivot_longer(
    cols = latitude:crown_radius_m,
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value, color = species)) +
  geom_density() +
  # `scales` is written out in full; `scale =` only worked through partial
  # argument matching.
  facet_wrap(~variable, scales = "free")
Warning message:
“Removed 4722 rows containing non-finite values (stat_density).”
In [25]:
# Number of records for each of the three selected species.
count(filtered_species, species)
A spec_tbl_df: 3 × 2
species n
<chr> <int>
Fagus sylvatica 9470
Pinus pinaster 11766
Quercus ilex 22760