lab03_webscraping.html 19KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. <!DOCTYPE html>
  2. <html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
  3. <head>
  4. <title>Web scraping</title>
  5. <meta charset="utf-8" />
  6. <meta name="author" content="Maxime Wack" />
  7. <link href="libs/remark-css-0.0.1/default.css" rel="stylesheet" />
  8. <script src="libs/htmlwidgets-1.5.1/htmlwidgets.js"></script>
  9. <script src="libs/jquery-1.12.4/jquery.min.js"></script>
  10. <link href="libs/datatables-css-0.0.0/datatables-crosstalk.css" rel="stylesheet" />
  11. <script src="libs/datatables-binding-0.10/datatables.js"></script>
  12. <link href="libs/dt-core-1.10.19/css/jquery.dataTables.min.css" rel="stylesheet" />
  13. <link href="libs/dt-core-1.10.19/css/jquery.dataTables.extra.css" rel="stylesheet" />
  14. <script src="libs/dt-core-1.10.19/js/jquery.dataTables.min.js"></script>
  15. <link href="libs/crosstalk-1.0.0/css/crosstalk.css" rel="stylesheet" />
  16. <script src="libs/crosstalk-1.0.0/js/crosstalk.min.js"></script>
  17. <link rel="stylesheet" href="css/my_style.css" type="text/css" />
  18. </head>
  19. <body>
  20. <textarea id="source">
  21. class: center, middle, title
  22. # UE Visualisation
  23. ### 2019-2020
  24. ## Dr. Maxime Wack
  25. ### AHU Informatique médicale
  26. #### Hôpital Européen Georges Pompidou, &lt;br&gt; Université de Paris
  27. ---
  28. # Web scraping
  29. ### Utilisation de `httr` et `rvest`
  30. ## httr
  31. Permet de faire des requêtes réseau
  32. → interroger et télécharger directement depuis R
  33. ## rvest
  34. Extraction de données depuis des pages HTML
  35. ---
  36. # httr
  37. Télécharger une page Wikipédia
  38. ```r
  39. GET("https://en.wikipedia.org/wiki/Comparison_of_operating_systems") -&gt; wiki
  40. ```
  41. ```
  42. ## Response [https://en.wikipedia.org/wiki/Comparison_of_operating_systems]
  43. ## Date: 2019-11-18 18:17
  44. ## Status: 200
  45. ## Content-Type: text/html; charset=UTF-8
  46. ## Size: 241 kB
  47. ## &lt;!DOCTYPE html&gt;
  48. ## &lt;html class="client-nojs" lang="en" dir="ltr"&gt;
  49. ## &lt;head&gt;
  50. ## &lt;meta charset="UTF-8"/&gt;
  51. ## &lt;title&gt;Comparison of operating systems - Wikipedia&lt;/title&gt;
  52. ## &lt;script&gt;document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
  53. ## "Articles with unsourced statements from May 2018","Articles with unsourced s...
  54. ## "wgNoticeProject":"wikipedia","wgWikibaseItemId":"Q3345986","wgCentralAuthMob...
  55. ## "ext.gadget.watchlist-notice","ext.gadget.DRN-wizard","ext.gadget.charinsert"...
  56. ## &lt;script&gt;(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.tokens...
  57. ## ...
  58. ```
  59. ---
  60. # Parsing HTML
  61. ```r
  62. wiki %&gt;%
  63. read_html -&gt; wiki_html
  64. ```
  65. ```
  66. ## {html_document}
  67. ## &lt;html class="client-nojs" lang="en" dir="ltr"&gt;
  68. ## [1] &lt;head&gt;\n&lt;meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
  69. ## [2] &lt;body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
  70. ```
  71. ---
  72. # Sélecteurs CSS
  73. [W3Schools](https://www.w3schools.com/cssref/css_selectors.asp)
  74. ### Sélecteurs permettant d'identifier un **nœud** précis dans le **DOM** (Document Object Model) d'une page HTML
  75. ### Permettent de sélectionner par identifiant, classe, position dans la hiérarchie, position entre éléments d'un même niveau, ou relativement à d'autres éléments
  76. ### Utiliser l'**inspecteur** des outils de développement du navigateur pour identifier les éléments à capturer
  77. ---
  78. # Sélecteurs CSS
  79. ```r
  80. wiki_html %&gt;%
  81. html_nodes(".wikitable")
  82. ```
  83. ```
  84. ## {xml_nodeset (4)}
  85. ## [1] &lt;table class="wikitable sortable" style="font-size: smaller; text-align: ...
  86. ## [2] &lt;table class="wikitable sortable" style="font-size: smaller; text-align: ...
  87. ## [3] &lt;table class="wikitable" style="font-size: smaller; text-align: center; w ...
  88. ## [4] &lt;table class="wikitable" style="font-size: smaller; text-align: center; w ...
  89. ```
  90. ```r
  91. wiki_html %&gt;%
  92. html_node("div + .wikitable")
  93. ```
  94. ```
  95. ## {html_node}
  96. ## &lt;table class="wikitable sortable" style="font-size: smaller; text-align: center; width: auto; sortby: name;"&gt;
  97. ## [1] &lt;tbody&gt;\n&lt;tr&gt;\n&lt;th&gt;Name\n&lt;/th&gt;\n&lt;th&gt;Creator\n&lt;/th&gt;\n&lt;th abbr="Initial rel ...
  98. ```
  99. ---
  100. # Extraction d'une table
  101. ```r
  102. wiki_html %&gt;%
  103. html_node("div + .wikitable") %&gt;%
  104. html_table -&gt; wikitable
  105. ```
  106. <div id="htmlwidget-091df5e78bc68f820c56" style="width:100%;height:auto;" class="datatables html-widget"></div>
  107. <script type="application/json" data-for="htmlwidget-091df5e78bc68f820c56">{"x":{"filter":"none","data":[["AIX","Android","AmigaOS classic","AmigaOS 4","Chrome OS","DragonFly BSD","FreeBSD","Genode","GhostBSD","eComStation","EPOC32","GNU/Linux","Haiku","HP-UX","IBM i","Inferno","iOS","IRIX","Classic Mac OS","MVS","macOS","macOS Server","MPE","MCP","MPE/XL","MINIX 3","NetBSD","NetWare","NeXTSTEP","OpenBSD","OpenIndiana","OpenVMS","OS/360","OS/390","OS 2200","OS/2","Plan 9","QNX","Solaris","Symbian","Symbian platform","Windows Server (NT family)","Windows (NT family)","Windows (classic 9x family)","RISC iX","RISC OS","RISC OS 4","RISC OS 5","RISC OS 6","ZETA","STOP 6, XTS-400","ReactOS","TrueOS","VxWorks","z/OS","z/VSE","z/VM","HP NonStop","Name"],["IBM","Android, Inc., Google","Commodore International, Haage &amp; Partner, Hyperion Entertainment","Hyperion Entertainment","Google","Matthew Dillon","The FreeBSD Project","Genode Labs","Eric Turgeon","Serenity Systems, Mensys BV","Psion PLC","Notable contributors include: Richard Stallman for GNU Project and Linus Torvalds for Linux and the Unixes they emulated; Red Hat, Debian Project See: Comparison of Linux distributions and Linux Kernel#Development","Haiku Inc.","Hewlett-Packard","IBM","Bell Labs","Apple Inc.","SGI","Apple Inc.","IBM","Apple Inc.","Apple Inc.","HP","Unisys","HP","Andrew S. 
Tanenbaum","The NetBSD Project","Novell","NeXT","The OpenBSD Project","Many, based on software developed by Sun Microsystems and many others","DEC (now VSI)","IBM","IBM","Unisys","IBM and Microsoft","Bell Labs","QNX Software Systems","Sun, Oracle Corporation","Symbian Ltd.","Symbian Foundation","Microsoft","Microsoft","Microsoft","Acorn Computers","Acorn Computers","RISCOS Ltd, Pace plc","Castle Technology, RISC OS Open","RISCOS Ltd","yellowTAB","BAE Systems","ReactOS development team","PC-BSD Software","Wind River Systems","IBM","IBM","IBM","HP","Creator"],["1986","2008","1985","2004","2009","2003","1993","2008","2009","2001","1996","1991 (kernel), See: Comparison of Linux distributions and History of Linux","2009","1983","1988","1997","2007","1988","1984","1972","2001","2001","1974","1961","1987","2005","1993","1985","1989","1995","2010","1977","1966","1995","1967 as Exec 8e","1987","1993","1982","1992","1998","2010 (initially 1998 as Symbian)","1993","1993","1995","1988","1987","1999","2002","2006","2005","2003","1996","2006","1987","2000","2007","2000","1974","Initial public release"],["UNIX System V Release 3","None","TRIPOS (as the disk operating component of AmigaOS)","AmigaOS classic","Chromium OS","FreeBSD","386BSD","None","FreeBSD","OS/2","","None","BeOS R5","UNIX System V","OS/400","Plan 9","macOS","UNIX System V","None[g 2][g 3]","OS/360","NeXTSTEP, BSD","NeXTSTEP, BSD","None","None","MPE","Minix2","386BSD","S-Net","Unix","NetBSD 1.0","OpenSolaris","RSX-11M","None","MVS","Exec 8, OS 1100","MS-DOS","Unix","Unix, POSIX","SunOS","EPOC32","Symbian","OS/2, Windows 3.x and MS-DOS","OS/2, Windows 9x and MS-DOS","MS-DOS, Windows NT 3.5","BSD 4.3","Arthur, also the BBC Master OS","RISC OS","RISC OS 4","RISC OS 4","BeOS R5","STOP 5, XTS-300","Windows NT","FreeBSD[g 5]","VRTX","OS/390","VSE/ESA","VM","Guardian","Predecessor"],["7.2","10","3.9 BB2","4.1 Final Edition","75.0.3770.102","5.6.1","12.0","19.05[1]","19.04[2]","2.1","ER5","4.19 (kernel)","R1/Beta","11i v3 
Update 16","7.3","Fourth Edition","13.1","6.5.30","9.2.2","MVS/ESA SP - JES3 Version 5 R2.2","10.14.6 Mojave","10.12 / September 20, 2016","MPE-V","CP OS 17.0","7.5","3.3.0","8.1","6.5 SP8","3.3","6.5","2019.04","8.4-2L2","Operating System/360 R21.8","OS/390 Version 2 R10","CP OS 16 (Exec 49.2)","4.52","Fourth Edition","7.0.0","11.4","9.5","3.0.4","Windows Server 2019 (Version 10.0.17763)","Windows 10 (Version 1903)","Windows Me (Win 4.90.3000)","1.21c","3.71","4.39","5.24[6]","6.20","1.5","6.5","0.4.10","10.1[7]","7","Version 2.3 (V2R3)","6.2","6.4","H06.24/J06.13","Current stable version"],["2015, October 5","2019, September 3","2002, March 20","2014","2019, March 5","2019, June 19","2018, December 11","2019, May 29","2019, April 13","2011","1999","2018, October 22 (kernel)","2018, September 29","2017, May","2016, April 15","2009, June 30","2019, September 24","2006","2000","1995, September 29","2019, August 26","2016, September 20","1988","2015, April","2002","2014","2019, May 31","2009, May 6","1995","2019, May 1","2017, May 3[3]","2017, July 10","1972, August","2000, September 29","2015, February 27","2001","2003 (except for minor later updates)","2017, March","2018, August 28","2009","2010","2018, October 2","2019, May 21","2000","1993","1997","2004","2018","2009","2007","2008, August","2018, November 6","2014, November 16","2014 March","2017, September 29","2017, December 1","2016, November 11","2012","Release date"],["Bundled with hardware","Free","Discontinued; Bundled with hardware up to version 3.0 (Amiga International hardware came with 3.1); versions 2.1, 3.0, 3.1, 3.5, 3.9 also available as separate packages","4.0 bundled with hardware; 4.0 for classic and 4.1 available as standalone package at €29","Bundled with hardware, 32-bit edition dropped","Free","Free","Free, source code only","Free","Home-student edition (max. 
three per site) US$145.00 (equivalent to $161.49 in 2018)business edition $290.00","Discontinued; Commercial","Free","Free","US$400","Bundled with hardware","Free","Bundled with hardware and free updates given to most existing users, subject to hardware requirements","Discontinued; Bundled with hardware","Discontinued; Was bundled with 68k and PowerPC Macs;\nversions 7-9 sold as retail upgrades[g 4]","Bundled with hardware","Bundled with hardware; No-cost update via Mac App Store for users of Mac OS X 10.6 or later, assuming hardware requirements are met","Previously bundled with hardware; No longer a separate operating system, but a group of services installed atop any current version of Mac OS X; US$19.99 on the Mac App Store","Discontinued; Was bundled with HP-3000 CISC hardware \"Classic\"","Bundled with hardware","Discontinued; Was bundled with HP-3000 PA-RISC hardware","Free","Free","Superseded by Novell Open Enterprise Server; Was US$184 (equivalent to $214.88 in 2018) (one-user)","Discontinued; Was bundled with hardware, then sold separately","Free","Free","Commercial, free non-commercial use","Bundled with hardware","Bundled with hardware","Bundled with hardware","Discontinued (see eComStation successor); Was US$300 (equivalent to $424.49 in 2018)","Free","Bundled with BlackBerry 10 and PlayBook devices. 
Commercial; an academic version exists that needs authorization code before installing","Commercial; (but free/no-cost perpetual license when used \"for the purpose of developing, testing, prototyping and demonstrating your applications\"[4])","Discontinued; Commercial","Free","US$1050 5 CALs server; other editions dependent on number of CALs purchased","Windows 10 Home US$119, Windows 10 Pro US$199[5]","Discontinued","Discontinued; Was bundled with hardware","Discontinued; Was bundled with hardware","Bundled with hardware, then sold separately at £70 (US$127)","Free","Bundled with hardware, then sold separately at £70 (US$127)","Discontinued","US$60,000 (equivalent to $69,821 in 2018)+; bundled with XTS hardware and OEM licensed","Free","Free","Paid","Monthly license fee, about US$130 and up","Monthly license fee","Monthly license fee","Non-free","Cost, availability"],["Proprietary","Apache 2.0, GNU GPLv2","Proprietary, open source clone available under AROS Public License","Proprietary","Proprietary: Google OS Terms of Service","BSD","BSD","AGPL","BSD","Proprietary","Proprietary","GNU GPLv2 (kernel)","MIT","Proprietary","Proprietary","MIT, GNU GPL, GNU LGPL, LPL","Proprietary higher level API layers; open source core system (ARM versions): APSL, GNU GPL, others","Proprietary","Proprietary","Proprietary","Proprietary higher level API layers; open source core system (Intel-PowerPC versions): APSL, GNU GPL, others","Proprietary higher level API layers; open source core system (Intel-PowerPC versions): APSL, GNU GPL, others","Proprietary","Proprietary","Proprietary","BSD","BSD","Proprietary","Proprietary","ISC","CDDL","Proprietary","Proprietary","Proprietary","Proprietary","Proprietary","LPL","Proprietary","CDDL","Proprietary","EPL","Proprietary; Source-available","Proprietary; Source-available","Proprietary","Proprietary","Proprietary","Proprietary","Apache License","Proprietary","Proprietary","Proprietary","GNU GPL, GNU 
LGPL","BSD","Proprietary","Proprietary","Proprietary","Proprietary","Proprietary","Preferred license[g 1]"],["Server, NAS, workstation","Smartphone, tablet computer, education","Workstation, personal computer","Workstation, personal computer","Chromebook, Chromebox, Chromebase and tablets","Server, workstation, NAS, embedded","Server, workstation, NAS, embedded","Desktop, Embedded, Server","Desktop, workstation","Server, workstation, personal computer","PDA","See: Comparison of Linux distributions","Personal computer","Server","Server","NAS, server, embedded","Smartphone, music player, tablet computer","Server, workstation","Workstation, personal computer","IBM mainframe","Workstation, personal computer, embedded","Server","Server","Server","Server","Workstation","NAS, server, workstation, embedded","Server","Workstation","Server, NAS, workstation, embedded","Server, workstation","Server, workstation","IBM mainframe","IBM mainframe","Server","Personal computer, server","Workstation, server, embedded, HPC","Automotive, medical, smartphones, consumer, industrial, embedded, safety","Server, workstation","Phones","embedded","Server, NAS, embedded","Workstation, personal computer, media center, Tablet PC, embedded","Personal computer, media center","Workstation","Education, personal computer","Education, personal computer","Education, personal computer","Education, personal computer","Personal computer, media center, workstation","Server, workstation","Workstation, personal computer","Personal computer, workstation, server","Embedded Real-time systems","IBM mainframe","IBM mainframe","IBM mainframe","HP Nonstop Servers","Target system type"]],"container":"<table class=\"display\">\n <thead>\n <tr>\n <th>Name<\/th>\n <th>Creator<\/th>\n <th>Initial public release<\/th>\n <th>Predecessor<\/th>\n <th>Current stable version<\/th>\n <th>Release date<\/th>\n <th>Cost, availability<\/th>\n <th>Preferred license[g 1]<\/th>\n <th>Target system type<\/th>\n <\/tr>\n 
<\/thead>\n<\/table>","options":{"paging":false,"info":false,"searching":false,"order":[],"autoWidth":false,"orderClasses":false}},"evals":[],"jsHooks":[]}</script>
  108. ---
  109. # Exercices
  110. ### Transformer cette table en forme normale
  111. ### Extraire la table avec les informations techniques
  112. ### Identifier les OS libres fonctionnant avec un microkernel
  113. </textarea>
  114. <style data-target="print-only">@media screen {.remark-slide-container{display:block;}.remark-slide-scaler{box-shadow:none;}}</style>
  115. <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
  116. <script src="addons/macros.js"></script>
  117. <script>var slideshow = remark.create({ // create the remark slideshow; the object literal below is the options argument
  118. "ratio": "4:3",
  119. "countIncrementalSlides": false,
  120. "self-contained": true,
  121. "highlightLines": true
  122. });
  123. if (window.HTMLWidgets) slideshow.on('afterShowSlide', function (slide) { // handler is only registered when window.HTMLWidgets is defined
  124. window.dispatchEvent(new Event('resize')); // fire a synthetic resize after each slide is shown; presumably lets widgets re-measure themselves (NOTE(review): confirm)
  125. });
  126. (function(d) { // add an @page print rule sized from the slide scaler element
  127. var s = d.createElement("style"), r = d.querySelector(".remark-slide-scaler");
  128. if (!r) return; // no .remark-slide-scaler element in the document: nothing to size from
  129. s.type = "text/css"; s.innerHTML = "@page {size: " + r.style.width + " " + r.style.height +"; }"; // uses the element's inline style width/height
  130. d.head.appendChild(s);
  131. })(document);
  132. (function(d) {
  133. var el = d.getElementsByClassName("remark-slides-area");
  134. if (!el) return;
  135. var slide, slides = slideshow.getSlides(), els = el[0].children;
  136. for (var i = 1; i < slides.length; i++) {
  137. slide = slides[i];
  138. if (slide.properties.continued === "true" || slide.properties.count === "false") {
  139. els[i - 1].className += ' has-continuation';
  140. }
  141. }
  142. var s = d.createElement("style");
  143. s.type = "text/css"; s.innerHTML = "@media print { .has-continuation { display: none; } }";
  144. d.head.appendChild(s);
  145. })(document);
  146. // delete the temporary CSS (for displaying all slides initially) when the user
  147. // starts to view slides
  148. (function() {
  149. var deleted = false;
  150. slideshow.on('beforeShowSlide', function(slide) {
  151. if (deleted) return;
  152. var sheets = document.styleSheets, node;
  153. for (var i = 0; i < sheets.length; i++) {
  154. node = sheets[i].ownerNode;
  155. if (node.dataset["target"] !== "print-only") continue;
  156. node.parentNode.removeChild(node);
  157. }
  158. deleted = true;
  159. });
  160. })();
  161. // adds .remark-code-has-line-highlighted class to <pre> parent elements
  162. // of code chunks containing highlighted lines with class .remark-code-line-highlighted
  163. (function(d) {
  164. const hlines = d.querySelectorAll('.remark-code-line-highlighted'); // every highlighted-line element in the document
  165. const preParents = []; // distinct <pre> ancestors found so far
  166. const findPreParent = function(line, p = 0) { // returns the parent or grandparent of `line` when it is a <pre>, otherwise null
  167. if (p > 1) return null; // traverse up no further than grandparent
  168. const el = line.parentElement;
  169. return el.tagName === "PRE" ? el : findPreParent(el, ++p); // not a <pre>: retry one level up, counting the step in p
  170. };
  171. for (let line of hlines) {
  172. let pre = findPreParent(line);
  173. if (pre && !preParents.includes(pre)) preParents.push(pre); // de-duplicate so each <pre> is recorded once
  174. }
  175. preParents.forEach(p => p.classList.add("remark-code-has-line-highlighted"));
  176. })(document);</script>
  177. <script>
  178. (function() { // set target="_blank" on every anchor whose href is an absolute http(s) or protocol-relative URL
  179. var links = document.getElementsByTagName('a');
  180. for (var i = 0; i < links.length; i++) {
  181. if (/^(https?:)?\/\//.test(links[i].getAttribute('href'))) { // matches hrefs starting with "http://", "https://" or "//"
  182. links[i].target = '_blank';
  183. }
  184. }
  185. })();
  186. </script>
  187. <script>
  188. slideshow._releaseMath = function(el) { // unwrap <code> elements under el whose entire text is a math expression
  189. var i, text, code, codes = el.getElementsByTagName('code');
  190. for (i = 0; i < codes.length;) { // no increment clause: i only advances when the current element is kept
  191. code = codes[i];
  192. if (code.parentNode.tagName !== 'PRE' && code.childElementCount === 0) { // only code not directly inside <pre> and with no child elements
  193. text = code.textContent;
  194. if (/^\\\((.|\s)+\\\)$/.test(text) || /^\\\[(.|\s)+\\\]$/.test(text) || // \( ... \) or \[ ... \]
  195. /^\$\$(.|\s)+\$\$$/.test(text) || // $$ ... $$
  196. /^\\begin\{([^}]+)\}(.|\s)+\\end\{[^}]+\}$/.test(text)) { // \begin{...} ... \end{...}
  197. code.outerHTML = code.innerHTML; // remove <code></code>
  198. continue; // stay on index i: getElementsByTagName returns a live collection, so the next element has moved into this slot
  199. }
  200. }
  201. i++;
  202. }
  203. };
  204. slideshow._releaseMath(document); // run once over the whole document
  205. </script>
  206. <!-- dynamically load mathjax for compatibility with self-contained -->
  207. <script>
  208. (function () { // append a MathJax script element to the document head at runtime
  209. var script = document.createElement('script');
  210. script.type = 'text/javascript';
  211. script.src = 'https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML';
  212. if (location.protocol !== 'file:' && /^https?:/.test(script.src)) // page not opened from file: and src carries an explicit http(s) scheme
  213. script.src = script.src.replace(/^https?:/, ''); // drop the scheme so the URL becomes protocol-relative
  214. document.getElementsByTagName('head')[0].appendChild(script);
  215. })();
  216. </script>
  217. </body>
  218. </html>