Click to view code
::p_load(tidyverse, readtext, skimr, knitr,
pacman
quanteda, tidytext, jsonlite, dplyr,
tidyr, tidygraph, ggraph, igraph, lubridate, visNetwork, ggplot2, gganimate, gridExtra)
Zou Jiaxun
June 7, 2024
In this challenge, I will attempt to do the following:
Rows: 75,817
Columns: 11
$ start_date <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "202…
$ type <chr> "Event.Owns.Shareholdership", "Event.Owns.Sharehol…
$ `_last_edited_by` <chr> "Pelagia Alethea Mordoch", "Niklaus Oberon", "Pela…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_date_added` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_raw_source` <chr> "Existing Corporate Structure Data", "Oceanus Corp…
$ `_algorithm` <chr> "Automatic Import", "Manual Entry", "Automatic Imp…
$ source <chr> "Avery Inc", "Berger-Hayes", "Bowers Group", "Bowm…
$ target <chr> "Allen, Nichols and Thompson", "Jensen, Morris and…
$ key <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ end_date <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Rows: 60,520
Columns: 15
$ type <chr> "Entity.Organization.Company", "Entity.Organizatio…
$ country <chr> "Uziland", "Mawalara", "Uzifrica", "Islavaragon", …
$ ProductServices <chr> "Unknown", "Furniture and home accessories", "Food…
$ PointOfContact <chr> "Rebecca Lewis", "Michael Lopez", "Steven Robertso…
$ HeadOfOrg <chr> "Émilie-Susan Benoit", "Honoré Lemoine", "Jules La…
$ founding_date <chr> "1954-04-24T00:00:00", "2009-06-12T00:00:00", "202…
$ revenue <dbl> 5994.73, 71766.67, 0.00, 0.00, 4746.67, 46566.67, …
$ TradeDescription <chr> "Unknown", "Abbott-Gomez is a leading manufacturer…
$ `_last_edited_by` <chr> "Pelagia Alethea Mordoch", "Pelagia Alethea Mordoc…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_date_added` <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_raw_source` <chr> "Existing Corporate Structure Data", "Existing Cor…
$ `_algorithm` <chr> "Automatic Import", "Automatic Import", "Automatic…
$ id <chr> "Abbott, Mcbride and Edwards", "Abbott-Gomez", "Ab…
$ dob <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# Assume the end_data which is NA be filled by 2035-12-12
mc3_edges_add <- mc3_edges %>%
mutate(
start_date = as.Date(start_date, format = "%Y-%m-%d"),
end_date = as.Date(ifelse(is.na(end_date), "2035-12-12", as.character(end_date)), format = "%Y-%m-%d")
) %>%
filter(!is.na(start_date))
# Display the modified dataset
glimpse(mc3_edges_add)
Rows: 75,727
Columns: 11
$ start_date <date> 2016-10-29, 2035-06-03, 2028-11-20, 2024-09-04, 2…
$ type <chr> "Event.Owns.Shareholdership", "Event.Owns.Sharehol…
$ `_last_edited_by` <chr> "Pelagia Alethea Mordoch", "Niklaus Oberon", "Pela…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_date_added` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_raw_source` <chr> "Existing Corporate Structure Data", "Oceanus Corp…
$ `_algorithm` <chr> "Automatic Import", "Manual Entry", "Automatic Imp…
$ source <chr> "Avery Inc", "Berger-Hayes", "Bowers Group", "Bowm…
$ target <chr> "Allen, Nichols and Thompson", "Jensen, Morris and…
$ key <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ end_date <date> 2035-12-12, 2035-12-12, 2035-12-12, 2035-12-12, 2…
Rows: 75,727
Columns: 5
$ type <chr> "Event.Owns.Shareholdership", "Event.Owns.Shareholdership",…
$ source <chr> "Avery Inc", "Berger-Hayes", "Bowers Group", "Bowman-Howe",…
$ target <chr> "Allen, Nichols and Thompson", "Jensen, Morris and Downs", …
$ start_date <date> 2016-10-29, 2035-06-03, 2028-11-20, 2024-09-04, 2034-11-12…
$ end_date <date> 2035-12-12, 2035-12-12, 2035-12-12, 2035-12-12, 2035-12-12…
Rows: 60,520
Columns: 3
$ type <chr> "Entity.Organization.Company", "Entity.Organization.Company", …
$ id <chr> "Abbott, Mcbride and Edwards", "Abbott-Gomez", "Abbott-Harriso…
$ country <chr> "Uziland", "Mawalara", "Uzifrica", "Islavaragon", "Oceanus", "…
Name | mc3_edges_filt |
Number of rows | 75727 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
character | 3 |
Date | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
type | 0 | 1 | 14 | 31 | 0 | 4 | 0 |
source | 0 | 1 | 6 | 42 | 0 | 51995 | 0 |
target | 0 | 1 | 6 | 48 | 0 | 8872 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
start_date | 0 | 1 | 1952-05-31 | 2035-12-29 | 2023-12-12 | 11468 |
end_date | 0 | 1 | 2035-01-01 | 2035-12-29 | 2035-12-12 | 73 |
Name | mc3_nodes_filt |
Number of rows | 60520 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
character | 3 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
type | 0 | 1 | 13 | 36 | 0 | 8 | 0 |
id | 0 | 1 | 6 | 48 | 0 | 60520 | 0 |
country | 0 | 1 | 4 | 15 | 0 | 86 | 0 |
According to the VAST2024 - MC3 Data Description, the person node include Entity.Person and Entity.Person.CEO, the other were included in the Company.
Based on this plot, the edge date will be used, and the company name to define company; Meanwhile, the type in node will be used, source and target will be choosed to decide the “from” and “to”.
The code chunk below will be used to extract the nodes data.frame of mc3_data and save it as a tibble data.frame called mc3_nodes.
# Clean and select columns from mc3_data$nodes
mc3_nodes_clean <- as_tibble(mc3_nodes_filt) %>%
mutate(type = type,
id = id) %>%
select(id, type) %>%
separate(type, into = c("prefix", "supertype", "subtype"), sep = "\\.", extra = "merge", fill = "right") %>%
select(id, supertype, subtype)
kable(head(mc3_nodes_clean))
id | supertype | subtype |
---|---|---|
Abbott, Mcbride and Edwards | Organization | Company |
Abbott-Gomez | Organization | Company |
Abbott-Harrison | Organization | Company |
Abbott-Ibarra | Organization | Company |
Abbott-Sullivan | Organization | Company |
Acevedo and Sons | Organization | Company |
The code chunk below will be used to extract the links data.frame of mc3_data and save it as a tibble data.frame called mc3_edges.
mc3_edges_clean <- as_tibble(mc3_edges_filt) %>%
mutate(
start_date = as_datetime(start_date),
end_date = as_datetime(end_date)
) %>%
group_by(source, target, type, start_date, end_date) %>%
summarise(weights = n(), .groups = 'drop') %>%
filter(source != target) %>%
ungroup()
kable(head(mc3_edges_clean))
source | target | type | start_date | end_date | weights |
---|---|---|---|---|---|
4. SeaCargo Ges.m.b.H. | Dry CreekRybachit Marine A/S | Event.Owns.Shareholdership | 2034-12-31 | 2035-12-12 | 1 |
4. SeaCargo Ges.m.b.H. | KambalaSea Freight Inc | Event.Owns.Shareholdership | 2033-04-12 | 2035-12-12 | 1 |
9. RiverLine CJSC | SumacAmerica Transport GmbH & Co. KG | Event.Owns.Shareholdership | 2028-12-02 | 2035-12-12 | 1 |
Aaron Acosta | Manning-Pratt | Event.Owns.Shareholdership | 2008-09-14 | 2035-12-12 | 1 |
Aaron Acosta | Manning-Pratt | Event.WorksFor | 2008-07-30 | 2035-12-12 | 1 |
Aaron Allen | Hicks-Calderon | Event.Owns.BeneficialOwnership | 2025-03-06 | 2035-12-12 | 1 |
Let’s focuse on the SouthSeafood Express Corp
# Define the target node
target_node <- "SouthSeafood Express Corp"
# Filter edges related to the target node
related_edges <- mc3_edges_clean %>%
filter(source == target_node | target == target_node) %>%
rename(from = source, to = target)
# Extract the related nodes
related_nodes <- mc3_nodes_clean %>%
filter(id %in% c(related_edges$from, related_edges$to))
# Create the visNetwork plot
visNetwork(related_nodes, related_edges) %>%
visPhysics(solver = "forceAtlas2Based",
forceAtlas2Based = list(gravitationalConstant = -100)) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visGroups(groupname = "Organization", color = "pink") %>%
visGroups(groupname = "Person", color = "lightgreen") %>%
visLegend() %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
nodesIdSelection = TRUE,
selectedBy = "subtype",
collapse = TRUE
) %>%
visInteraction(dragNodes = TRUE, dragView = TRUE, zoomView = TRUE) %>%
visEvents(selectNode = "function(properties) {
var selectedNodeId = properties.nodes[0];
this.body.data.nodes.update({id: selectedNodeId, color: {background: 'red', border: 'black'}});
}")
# Define the target node
target_node <- "SouthSeafood Express Corp"
# Filter edges related to the target node
initial_edges <- mc3_edges_clean %>%
filter(source == target_node | target == target_node) %>%
rename(from = source, to = target)
connected_nodes <- mc3_nodes_clean %>%
filter(id %in% c(initial_edges$from, initial_edges$to))
expanded_edges <- mc3_edges_clean %>%
filter(source %in% connected_nodes$id | target %in% connected_nodes$id) %>%
rename(from = source, to = target)
# Extract source and target nodes from expanded edges
expanded_nodes_source <- expanded_edges %>%
distinct(from) %>%
rename(id = from)
expanded_nodes_target <- expanded_edges %>%
distinct(to) %>%
rename(id = to)
# Combine and deduplicate nodes
expanded_nodes_combined <- bind_rows(expanded_nodes_source, expanded_nodes_target) %>%
distinct(id) %>%
left_join(mc3_nodes_clean, by = "id")
# Assign group based on supertype
expanded_nodes_combined$group <- ifelse(expanded_nodes_combined$supertype == "Organization", "Organization", "Person")
# Create the visNetwork plot
visNetwork(expanded_nodes_combined, expanded_edges) %>%
visPhysics(solver = "forceAtlas2Based",
forceAtlas2Based = list(gravitationalConstant = -100)) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visGroups(groupname = "Organization", color = "pink") %>%
visGroups(groupname = "Person", color = "lightgreen") %>%
visLegend() %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
nodesIdSelection = TRUE,
selectedBy = "subtype",
collapse = TRUE
) %>%
visInteraction(dragNodes = TRUE, dragView = TRUE, zoomView = TRUE) %>%
visEvents(selectNode = "function(properties) {
var selectedNodeId = properties.nodes[0];
this.body.data.nodes.update({id: selectedNodeId, color: {background: 'red', border: 'black'}});
}")
# Identify nodes directly connected to the target node
connected_nodes <- mc3_nodes_clean %>%
filter(id %in% c(initial_edges$from, initial_edges$to))
expanded_edges <- initial_edges
expanded_nodes <- connected_nodes
repeat {
# Identify all nodes connected to the current set of nodes
new_edges <- mc3_edges_clean %>%
filter(source %in% expanded_nodes$id | target %in% expanded_nodes$id) %>%
rename(from = source, to = target)
# Identify new nodes from the newly found edges
new_nodes <- mc3_nodes_clean %>%
filter(id %in% c(new_edges$from, new_edges$to)) %>%
filter(!id %in% expanded_nodes$id)
# Add new edges and nodes to the expanded set
expanded_edges <- bind_rows(expanded_edges, new_edges) %>%
distinct()
expanded_nodes <- bind_rows(expanded_nodes, new_nodes) %>%
distinct()
# Break the loop if there are no new nodes to add or all new nodes are of type "Person"
if (nrow(new_nodes) == 0 || all(new_nodes$supertype == "Person")) {
break
}
}
# Extract source and target nodes from expanded edges
expanded_nodes_source <- expanded_edges %>%
distinct(from) %>%
rename(id = from)
expanded_nodes_target <- expanded_edges %>%
distinct(to) %>%
rename(id = to)
# Combine and deduplicate nodes
expanded_nodes_combined <- bind_rows(expanded_nodes_source, expanded_nodes_target) %>%
distinct(id) %>%
left_join(mc3_nodes_clean, by = "id")
# Assign group based on supertype
expanded_nodes_combined$group <- ifelse(expanded_nodes_combined$supertype == "Organization", "Organization", "Person")
# Create the visNetwork plot
visNetwork(expanded_nodes_combined, expanded_edges) %>%
visPhysics(solver = "forceAtlas2Based",
forceAtlas2Based = list(gravitationalConstant = -100)) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visGroups(groupname = "Organization", color = "pink") %>%
visGroups(groupname = "Person", color = "lightgreen") %>%
visLegend() %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
nodesIdSelection = TRUE,
selectedBy = "subtype",
collapse = TRUE
) %>%
visInteraction(dragNodes = TRUE, dragView = TRUE, zoomView = TRUE) %>%
visEvents(selectNode = "function(properties) {
var selectedNodeId = properties.nodes[0];
this.body.data.nodes.update({id: selectedNodeId, color: {background: 'red', border: 'black'}});
}")
First, create a graph object using tbl_graph() function. Then calculate betweenness and closeness centrality scores.
We will filter nodes with high betweeness centrality scores (>8,000,000) and visualise them to see the relationships that they have.
set.seed(1234)
mc3_graph %>%
filter(betweenness_centrality > 8000000) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha=0.5)) +
geom_node_point(aes(
size = betweenness_centrality,
color = supertype,
alpha = 0.3)) +
geom_node_label(aes(label = id),repel=TRUE, size=2.5, alpha = 0.8) +
scale_size_continuous(range=c(1,10)) +
theme_graph() +
labs(title = 'Initial network visualisation',
subtitle = 'Entities with betweenness scores > 8,000,000')
Below is a dataframe showing us the top 10 entities with the highest betweenness scores.
id | supertype | subtype | betweenness_centrality | closeness_centrality |
---|---|---|---|---|
Sharon Moon | Person | NA | 31170638 | 3.6e-06 |
Steven Robertson | Person | NA | 28158072 | 3.2e-06 |
Johnson, Morales and Castro | Organization | Company | 26218503 | 3.7e-06 |
Hart Ltd | Organization | FinancialCompany | 26129255 | 3.2e-06 |
Abbott-Harrison | Organization | Company | 24206713 | 3.0e-06 |
Phelps, Montoya and Barnett | Organization | Company | 21073981 | 3.0e-06 |
Morrison-Zamora | Organization | Company | 20813895 | 3.7e-06 |
Blankenship-Strickland | Organization | Company | 20157207 | 3.4e-06 |
Holloway-Salas | Organization | Company | 19957178 | 3.7e-06 |
Michael Howard DDS | Person | NA | 19922437 | 3.5e-06 |
The top 10 betweenness entities above include 3 persons and 7 companies. In the next section, we will filter entities into only organization entities. We may revisit the person entities later when we have specific targets/companies to investigate.
# Time bins
bin1 <- mc3_edges_clean %>%
filter(as.Date(start_date) < as.Date("2033-10-29"))
bin2 <- mc3_edges_clean %>%
filter(as.Date(start_date) >= as.Date("2033-10-29") & as.Date(start_date) <= as.Date("2035-05-25"))
bin3 <- mc3_edges_clean %>%
filter(as.Date(start_date) > as.Date("2035-05-25"))
# Filter edges to include any that contain "SouthSeafood"
southseafood_edges_bin1 <- bin1 %>%
filter(str_detect(source, "SouthSeafood") | str_detect(target, "SouthSeafood"))
southseafood_edges_bin2 <- bin2 %>%
filter(str_detect(source, "SouthSeafood") | str_detect(target, "SouthSeafood"))
southseafood_edges_bin3 <- bin3 %>%
filter(str_detect(source, "SouthSeafood") | str_detect(target, "SouthSeafood"))
glimpse(southseafood_edges_bin1)
Rows: 0
Columns: 6
$ source <chr>
$ target <chr>
$ type <chr>
$ start_date <dttm>
$ end_date <dttm>
$ weights <int>
Rows: 2
Columns: 6
$ source <chr> "AguaLeska Transit N.V.", "Tainamarine Fishing Co"
$ target <chr> "SouthSeafood Express Corp", "SouthSeafood Express Corp"
$ type <chr> "Event.Owns.Shareholdership", "Event.Owns.Shareholdership"
$ start_date <dttm> 2033-10-29, 2035-05-25
$ end_date <dttm> 2035-05-25, 2035-12-12
$ weights <int> 1, 1
Rows: 0
Columns: 6
$ source <chr>
$ target <chr>
$ type <chr>
$ start_date <dttm>
$ end_date <dttm>
$ weights <int>
We notice that, there are only 2 companies related to the SouthSeafood Express Corp, they might be shell company.
# Combine and deduplicate nodes
mc3_nodes_combined <- bind_rows(mc3_nodes_source, mc3_nodes_target) %>%
distinct(id) %>%
left_join(mc3_nodes_clean, by = "id")
# Assign group based on supertype
mc3_nodes_combined$group <- ifelse(mc3_nodes_combined$supertype == "Organization", "Organization", "Person")
# Visualize with visNetwork
visNetwork(mc3_nodes_combined, mc3_edges_select_high) %>%
visPhysics(solver = "forceAtlas2Based",
forceAtlas2Based = list(gravitationalConstant = -100)) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visGroups(groupname = "Organization", color = "yellow") %>%
visGroups(groupname = "Person", color = "grey") %>%
visLegend() %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
nodesIdSelection = TRUE,
selectedBy = "subtype",
collapse = TRUE
) %>%
visInteraction(dragNodes = TRUE, dragView = TRUE, zoomView = TRUE) %>%
visEvents(selectNode = "function(properties) {
var selectedNodeId = properties.nodes[0];
this.body.data.nodes.update({id: selectedNodeId, color: {background: 'red', border: 'black'}});
}")
Since the SouthSeafood Express Corp connect first event from AguaLeska Transit N.V. in 2033-10-29, and turn to connect Tainamarine Fishing Co in 2035-05-25
# Filter edges based on date criteria
mc3_edges_filtered_dates <- mc3_edges_clean %>%
filter(start_date < as_datetime("2033-10-29") & end_date > as_datetime("2033-10-29"))
# Visualize with visNetwork
visNetwork(mc3_nodes_combined, mc3_edges_select_high) %>%
visPhysics(solver = "forceAtlas2Based",
forceAtlas2Based = list(gravitationalConstant = -100)) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visGroups(groupname = "Organization", color = "lightblue") %>%
visGroups(groupname = "Person", color = "lightgreen") %>%
visLegend() %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
nodesIdSelection = TRUE,
selectedBy = "subtype",
collapse = TRUE
) %>%
visInteraction(dragNodes = TRUE, dragView = TRUE, zoomView = TRUE) %>%
visEvents(selectNode = "function(properties) {
var selectedNodeId = properties.nodes[0];
this.body.data.nodes.update({id: selectedNodeId, color: {background: 'red', border: 'black'}});
}")
mc3_edges_filtered_dates <- mc3_edges_clean %>%
filter(end_date > as_datetime("2035-05-25"))
# Visualize with visNetwork
visNetwork(mc3_nodes_combined, mc3_edges_select_high) %>%
visPhysics(solver = "forceAtlas2Based",
forceAtlas2Based = list(gravitationalConstant = -100)) %>%
visIgraphLayout(layout = "layout_with_fr") %>%
visGroups(groupname = "Organization", color = "lightblue") %>%
visGroups(groupname = "Person", color = "lightgreen") %>%
visLegend() %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
nodesIdSelection = TRUE,
selectedBy = "subtype",
collapse = TRUE
) %>%
visInteraction(dragNodes = TRUE, dragView = TRUE, zoomView = TRUE) %>%
visEvents(selectNode = "function(properties) {
var selectedNodeId = properties.nodes[0];
this.body.data.nodes.update({id: selectedNodeId, color: {background: 'red', border: 'black'}});
}")
We can see from the visNetwork, some person control more than 30 companies at the same time, there will be an obvious control of illegal financial activities.
Exploring the networks between the various type of nodes or players in the space has been useful to visualising the relationships between the different parties. It has yielded interesting insights on how certain companies may influence the around companies.
For future work, the additional column of timeline and financial situation can be used to provide an additional layer of to the overall visualisation of networks and information in this project.