# 2024_updatedSubsampling.R
### example using beetles, repeated on each major dataset
library(dplyr)
library(tidyr)
library(groupdata2)
# Load the beetle occurrence records and the per-taxon summary table, then
# attach the summary columns to every occurrence row.
#
# `quote = "\""` and `comment.char = ""` override read.table()'s defaults
# (quote = "\"'" and comment.char = "#"). With those defaults an apostrophe or
# a "#" inside any field of a comma-separated file merges or truncates rows.
beetlesAll <- read.table(
  file = "subset_GBIF_coleoptera.csv", header = TRUE, sep = ",",
  quote = "\"", comment.char = ""
)
beetlesSummary <- read.table(
  file = "GBIF_coleoptera_summary.csv", header = TRUE, sep = ",",
  quote = "\"", comment.char = ""
)

# No `by` is given, so dplyr joins on every column name the two tables share.
# NOTE(review): presumably the key is acceptedTaxonKey -- confirm, then pass it
# explicitly via `by =` so an unintended shared column cannot alter the match.
beetlesAllPlusSummary <- beetlesAll %>% left_join(beetlesSummary)
# Keep only the rows whose `nrow` value reaches a minimum (20, then 5), balance
# the number of rows per acceptedTaxonKey, and save each balanced table as a
# tab-separated text file.
#
# NOTE(review): `nrow` is presumably the per-taxon record count supplied by the
# summary table -- confirm against GBIF_coleoptera_summary.csv.
#
# downsample() (groupdata2) randomly removes rows until every group is as small
# as the smallest one. The RNG is therefore seeded before each call so the
# written files are reproducible between runs, and so the 5-plus result does
# not depend on whether the 20-plus step ran first.
subsampleSeed <- 20240314L

beetlesAllPlusSummary_20plus <- subset(
  beetlesAllPlusSummary,
  beetlesAllPlusSummary$nrow >= 20
)
set.seed(subsampleSeed)
beetlesAllPlusSummary_20plus_ds <- downsample(
  beetlesAllPlusSummary_20plus,
  cat_col = "acceptedTaxonKey"
)
write.table(
  beetlesAllPlusSummary_20plus_ds,
  "14.iii.2024_beetlesAllPlusSummary_20plus_ds.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)

beetlesAllPlusSummary_5plus <- subset(
  beetlesAllPlusSummary,
  beetlesAllPlusSummary$nrow >= 5
)
set.seed(subsampleSeed)
beetlesAllPlusSummary_5plus_ds <- downsample(
  beetlesAllPlusSummary_5plus,
  cat_col = "acceptedTaxonKey"
)
write.table(
  beetlesAllPlusSummary_5plus_ds,
  "14.iii.2024_beetlesAllPlusSummary_5plus_ds.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
# Count the rows of the balanced 20-plus table in each family, then keep and
# save the families with at least 2000 rows.
#
# table() returns the family names and their counts together, so the two
# columns cannot drift out of step the way separately computed tapply() counts
# and sort(unique()) names can (e.g. when `family` is a factor with unused
# levels). Rows with a missing family are left out, as before.
#
# `taxonCount` is the number of ROWS per family, not the number of distinct
# taxa. NOTE(review): the `_100taxaPlus` name matches the 2000-row cut-off only
# if every taxon was downsampled to exactly 20 rows (2000 / 20 = 100) --
# confirm the smallest group size in the input.
beetleFamilies20DF <- as.data.frame(
  table(family = beetlesAllPlusSummary_20plus_ds$family),
  responseName = "taxonCount",
  stringsAsFactors = FALSE
)
# Kept so that anything inspecting these workspace objects still finds them.
beetleFamily20List <- beetleFamilies20DF$family
beetleFamilies20SpeciesCount <- beetleFamilies20DF$taxonCount
head(beetleFamilies20DF)

beetleFamilies20DF_100taxaPlus <- subset(
  beetleFamilies20DF,
  beetleFamilies20DF$taxonCount >= 2000
)
write.table(
  beetleFamilies20DF_100taxaPlus,
  "14.iii.2024_beetleFamilies20DF_100taxaPlus.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
# Count the rows of the balanced 5-plus table in each family, then keep and
# save the families with at least 500 rows.
#
# table() returns the family names and their counts together, so the two
# columns cannot drift out of step the way separately computed tapply() counts
# and sort(unique()) names can (e.g. when `family` is a factor with unused
# levels). Rows with a missing family are left out, as before.
#
# `taxonCount` is the number of ROWS per family, not the number of distinct
# taxa. NOTE(review): the `_100taxaPlus` name matches the 500-row cut-off only
# if every taxon was downsampled to exactly 5 rows (500 / 5 = 100) -- confirm
# the smallest group size in the input.
beetleFamilies5DF <- as.data.frame(
  table(family = beetlesAllPlusSummary_5plus_ds$family),
  responseName = "taxonCount",
  stringsAsFactors = FALSE
)
# Kept so that anything inspecting these workspace objects still finds them.
beetleFamily5List <- beetleFamilies5DF$family
beetleFamilies5SpeciesCount <- beetleFamilies5DF$taxonCount
head(beetleFamilies5DF)

beetleFamilies5DF_100taxaPlus <- subset(
  beetleFamilies5DF,
  beetleFamilies5DF$taxonCount >= 500
)
write.table(
  beetleFamilies5DF_100taxaPlus,
  "14.iii.2024_beetleFamilies5DF_100taxaPlus.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
# Restrict each balanced table to the families retained in the matching
# family-count table, then save the result as a tab-separated text file.
#
# inner_join() keeps only rows whose `family` occurs in the family table and
# appends that table's `taxonCount` column. The key is named explicitly so the
# match cannot silently change if the two tables ever share another column
# name, and so dplyr does not have to guess (and report) the join columns.
beetlesAllPlusSummary_20plus_ds_100taxaPlus <-
  beetlesAllPlusSummary_20plus_ds %>%
  inner_join(beetleFamilies20DF_100taxaPlus, by = "family")
write.table(
  beetlesAllPlusSummary_20plus_ds_100taxaPlus,
  "14.iii.2024_beetlesAllPlusSummary_20plus_ds_100taxaPlus.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)

beetlesAllPlusSummary_5plus_ds_100taxaPlus <-
  beetlesAllPlusSummary_5plus_ds %>%
  inner_join(beetleFamilies5DF_100taxaPlus, by = "family")
write.table(
  beetlesAllPlusSummary_5plus_ds_100taxaPlus,
  "14.iii.2024_beetlesAllPlusSummary_5plus_ds_100taxaPlus.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)