[ad_1]
I am trying to collect some menu information from a website, but I am getting a little stuck on correctly extracting the drop-down menu items.
I want the following items:
and so on, for each of the drop-down menus on the distritos page.
However, when we get down to Centre Badalona, there is no drop-down menu, so there is nothing to collect for it.
For example, the code below can get me the following output:
> collectZonaPageSnapshot %>%
+ html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked.re-GeographicSearchNext-checkboxItem--has-separator')
{xml_nodeset (9)}
[1] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Artigues - Llefià" href="/es/comprar/viviendas/badalona/artigues-llefia/l"><div class="sui-MoleculeCh ...
[2] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Bonavista - Bufalà - Morera" href="/es/comprar/viviendas/badalona/bonavista-bufala-morera/l"><div cla ...
[3] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Canyet - Pomar" href="/es/comprar/viviendas/badalona/canyet-pomar/l"><div class="sui-MoleculeCheckbox ...
[4] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Casagemes - Canyadó" href="/es/comprar/viviendas/badalona/casagemes-canyado/l"><div class="sui-Molecu ...
[5] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Centre Badalona" href="/es/comprar/viviendas/badalona/centre-badalona/l"><div class="sui-MoleculeChec ...
[6] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Gorg - Progrés" href="/es/comprar/viviendas/badalona/gorg-progres/l"><div class="sui-MoleculeCheckbox ...
[7] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Montigalà - Sant Crist" href="/es/comprar/viviendas/badalona/montigala-sant-crist/l"><div class="sui- ...
[8] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Port" href="/es/comprar/viviendas/badalona/port/l"><div class="sui-MoleculeCheckboxField" style=""><d ...
[9] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Salut - Lloreda" href="/es/comprar/viviendas/badalona/salut-lloreda/l"><div class="sui-MoleculeCheckb ...
> collectZonaPageSnapshot %>%
+ html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked')
{xml_nodeset (31)}
[1] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Artigues - Llefià" href="/es/comprar/viviendas/badalona/artigues-llefia/l"><div class="sui-MoleculeC ...
[2] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Artigues" href="/es/comprar/viviendas/badalona/artigues/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeFie ...
[3] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Llefià" href="/es/comprar/viviendas/badalona/llefia/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
[4] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Sant Roc" href="/es/comprar/viviendas/badalona/sant-roc/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeFie ...
[5] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Bonavista - Bufalà - Morera" href="/es/comprar/viviendas/badalona/bonavista-bufala-morera/l"><div cl ...
[6] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Bonavista" href="/es/comprar/viviendas/badalona/bonavista/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeF ...
[7] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Bufalà" href="/es/comprar/viviendas/badalona/bufala/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
[8] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Morera" href="/es/comprar/viviendas/badalona/morera/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
[9] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Canyet - Pomar" href="/es/comprar/viviendas/badalona/canyet-pomar/l"><div class="sui-MoleculeCheckbo ...
[10] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Canyet" href="/es/comprar/viviendas/badalona/canyet/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField-- ...
[11] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Mas Ram" href="/es/comprar/viviendas/badalona/mas-ram/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[12] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Pomar" href="/es/comprar/viviendas/badalona/pomar/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField--in ...
[13] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Casagemes - Canyadó" href="/es/comprar/viviendas/badalona/casagemes-canyado/l"><div class="sui-Molec ...
[14] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Canyadó" href="/es/comprar/viviendas/badalona/canyado/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[15] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Casagemes" href="/es/comprar/viviendas/badalona/casagemes/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeF ...
[16] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Manresà" href="/es/comprar/viviendas/badalona/manresa/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[17] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Centre Badalona" href="/es/comprar/viviendas/badalona/centre-badalona/l"><div class="sui-MoleculeChe ...
[18] <a class="re-GeographicSearchNext-checkboxItem is-checked re-GeographicSearchNext-checkboxItem--has-separator" title="Gorg - Progrés" href="/es/comprar/viviendas/badalona/gorg-progres/l"><div class="sui-MoleculeCheckbo ...
[19] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="Congrés" href="/es/comprar/viviendas/badalona/congres/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeField ...
[20] <a class="re-GeographicSearchNext-checkboxItem is-checked" title="El Remei" href="/es/comprar/viviendas/badalona/el-remei/l"><div class="sui-MoleculeCheckboxField" style=""><div class="sui-MoleculeField sui-MoleculeFie ...
The first part gives me the “parent” menus. The second part gives me the “parent” and “child” menus but I can’t distinguish between the parent and child menus.
Expected output:
To be able to extract the URLs, names etc. with a similar structure to the menu page.
- Artigues - Llefià
-- Artigues
-- Llefià
-- Sant Roc
- Bonavista - Bufalà - Morera
-- Bonavista
-- Bufalà
-- Morera
- Canyet - Pomar
-- Canyet
-- Mas Ram
-- Pomar
etc. (currently I can only get them in a “non-tree” format – i.e. I can’t tell which is the parent menu and which is the child menu)
> collectZonaPageSnapshot %>%
+ html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked') %>%
+ html_text()
[1] "Artigues - Llefià851" "Artigues38" "Llefià714" "Sant Roc99" "Bonavista - Bufalà - Morera233" "Bonavista34"
[7] "Bufalà156" "Morera43" "Canyet - Pomar53" "Canyet6" "Mas Ram29" "Pomar18"
[13] "Casagemes - Canyadó40" "Canyadó9" "Casagemes29" "Manresà2" "Centre Badalona141" "Gorg - Progrés267"
[19] "Congrés32" "El Remei25" "Gorg69" "Progrés - Pep Ventura132" "Montigalà - Sant Crist209" "Montigalà21"
[25] "Puigfred86" "Sant Crist97" "Port79" "Salut - Lloreda592" "La Salut399" "Lloreda133"
[31] "Sistrells60"
Code:
library(RSelenium)
library(rvest)
library(tidyverse)
# Scrape the "distrito" drop-down menus of a fotocasa listing page: drive a
# Firefox session with RSelenium, click through the menu, then parse the
# rendered page source with rvest.
distrito_url_to_get <- "https://www.fotocasa.es/es/comprar/viviendas/badalona/todas-las-zonas/l"

# Start the Selenium server/browser pair and open the target page.
rD <- rsDriver(browser = "firefox", port = 4536L)
remDr <- rD[["client"]]
remDr$navigate(distrito_url_to_get)
remDr$maxWindowSize()

# Click "Accept" on the consent dialog.
remDr$findElement(using = "xpath", '/html/body/div[1]/div[4]/div/div/div/footer/div/button[2]')$clickElement()

# Click on "Distrito" to open the district menu.
remDr$findElement(using = "xpath", '/html/body/div[1]/div[2]/div[1]/div[3]/div/div[1]/div')$clickElement()

# Locate the checkbox fields and the drop-down toggle icons.
# The click loop below iterates over distritoDropDownElements, so that lookup
# must be live; it was previously commented out, which made the loop fail with
# "object 'distritoDropDownElements' not found" in a fresh session.
# NOTE(review): the '.is-checked' selectors further down suggest the checkbox
# fields (not the toggle icons) are what needs clicking -- confirm on the page.
distritoDropDownElements <- remDr$findElements(using = "css selector", '.sui-MoleculeCheckboxField')
distritoDropDownToggleIconElements <- remDr$findElements(using = "css selector", '.sui-MoleculeCheckboxField-toggleIcon')

# Click each of the boxes to "activate" the HTML page.
# seq_along() yields an empty sequence when nothing was found, whereas
# 1:length(x) would give c(1, 0) and index out of bounds.
for (i in seq_along(distritoDropDownElements)) {
  distritoDropDownElements[[i]]$clickElement()
}

# Read the rendered page source into an xml2 document.
collectZonaPageSnapshot <- remDr$getPageSource()[[1]] %>%
  read_html()

# Part 1) collect the parent menus (items carrying the separator class).
collectZonaPageSnapshot %>%
  html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked.re-GeographicSearchNext-checkboxItem--has-separator')

# Part 2) collect every checked item; this returns parents and children mixed
# together in document order.
collectZonaPageSnapshot %>%
  html_nodes('.re-GeographicSearchNext-checkboxItem.is-checked')
[ad_2]