You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lab03-webscraping.html 20KB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  <!DOCTYPE html>
  <html lang="fr" xml:lang="fr">
  <head>
    <!-- The slide text is French, so the document language is declared as "fr"
         (an empty lang attribute explicitly means "unknown language"). -->
    <!-- Character encoding is declared first, before any other head content. -->
    <meta charset="utf-8" />
    <title>Web scraping</title>
    <meta name="author" content="Maxime Wack" />
    <!-- Script and stylesheet order below is unchanged from the generated file. -->
    <script src="libs/header-attrs-2.1/header-attrs.js"></script>
    <link href="libs/remark-css-0.0.1/default.css" rel="stylesheet" />
    <script src="libs/htmlwidgets-1.5.1/htmlwidgets.js"></script>
    <script src="libs/jquery-1.12.4/jquery.min.js"></script>
    <link href="libs/datatables-css-0.0.0/datatables-crosstalk.css" rel="stylesheet" />
    <script src="libs/datatables-binding-0.13/datatables.js"></script>
    <link href="libs/dt-core-1.10.20/css/jquery.dataTables.min.css" rel="stylesheet" />
    <link href="libs/dt-core-1.10.20/css/jquery.dataTables.extra.css" rel="stylesheet" />
    <script src="libs/dt-core-1.10.20/js/jquery.dataTables.min.js"></script>
    <link href="libs/crosstalk-1.1.0.1/css/crosstalk.css" rel="stylesheet" />
    <script src="libs/crosstalk-1.1.0.1/js/crosstalk.min.js"></script>
    <link rel="stylesheet" href="css/my_style.css" type="text/css" />
  </head>
  20. <body>
  21. <textarea id="source">
  22. class: center, middle, title
  23. # UE Visualisation
  24. ### 2020-2021
  25. ## Dr. Maxime Wack
  26. ### AHU Informatique médicale
  27. #### Hôpital Européen Georges Pompidou, &lt;br&gt; Université de Paris
  28. ---
  29. # Web scraping
  30. ### Utilisation de `httr` et `rvest`
  31. ## httr
  32. Permet de faire des requêtes réseau
  33. → interroger et télécharger directement depuis R
  34. ## rvest
  35. Extraction de données depuis des pages HTML
  36. ---
  37. # httr
  38. Télécharger une page wikipedia
  39. ```r
  40. GET("https://en.wikipedia.org/wiki/Comparison_of_operating_systems") -&gt; wiki
  41. ```
  42. ```
  43. ## Response [https://en.wikipedia.org/wiki/Comparison_of_operating_systems]
  44. ## Date: 2020-11-17 03:43
  45. ## Status: 200
  46. ## Content-Type: text/html; charset=UTF-8
  47. ## Size: 227 kB
  48. ## &lt;!DOCTYPE html&gt;
  49. ## &lt;html class="client-nojs" lang="en" dir="ltr"&gt;
  50. ## &lt;head&gt;
  51. ## &lt;meta charset="UTF-8"/&gt;
  52. ## &lt;title&gt;Comparison of operating systems - Wikipedia&lt;/title&gt;
  53. ## &lt;script&gt;document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
  54. ## "Articles with unsourced statements from February 2007","Wikipedia articles n...
  55. ## "Q3345986"};RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"rea...
  56. ## "ext.wikimediaEvents","ext.navigationTiming","ext.uls.compactlinks","ext.uls....
  57. ## &lt;script&gt;(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.option...
  58. ## ...
  59. ```
  60. ---
  61. # Parsing HTML
  62. ```r
  63. wiki %&gt;%
  64. read_html -&gt; wiki_html
  65. ```
  66. ```
  67. ## {html_document}
  68. ## &lt;html class="client-nojs" lang="en" dir="ltr"&gt;
  69. ## [1] &lt;head&gt;\n&lt;meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
  70. ## [2] &lt;body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
  71. ```
  72. ---
  73. # Sélecteurs CSS
  74. [W3Schools](https://www.w3schools.com/cssref/css_selectors.asp)
  75. ### Sélecteurs permettant d'identifier un **nœud** précis dans le **DOM** (Document Object Model) d'une page HTML
  76. ### Permet de sélectionner par identifiant, classe, position dans la hiérarchie, position entre éléments d'un même niveau, ou relativement entre éléments
  77. ### Utiliser l'**inspecteur** des outils de développement du navigateur pour identifier les éléments à capturer
  78. ---
  79. # Sélecteurs CSS
  80. ```r
  81. wiki_html %&gt;%
  82. html_nodes(".wikitable")
  83. ```
  84. ```
  85. ## {xml_nodeset (4)}
  86. ## [1] &lt;table class="wikitable sortable" style="font-size: smaller; text-align: ...
  87. ## [2] &lt;table class="wikitable sortable" style="font-size: smaller; text-align: ...
  88. ## [3] &lt;table class="wikitable" style="font-size: smaller; text-align: center; w ...
  89. ## [4] &lt;table class="wikitable" style="font-size: smaller; text-align: center; w ...
  90. ```
  91. ```r
  92. wiki_html %&gt;%
  93. html_node("div + .wikitable")
  94. ```
  95. ```
  96. ## {html_node}
  97. ## &lt;table class="wikitable sortable" style="font-size: smaller; text-align: center; width: auto; sortby: name;"&gt;
  98. ## [1] &lt;tbody&gt;\n&lt;tr&gt;\n&lt;th&gt;Name\n&lt;/th&gt;\n&lt;th&gt;Creator\n&lt;/th&gt;\n&lt;th abbr="Initial rel ...
  99. ```
  100. ---
  101. # Extraction d'une table
  102. ```r
  103. wiki_html %&gt;%
  104. html_node("div + .wikitable") %&gt;%
  105. html_table -&gt; wikitable
  106. ```
  107. <div id="htmlwidget-bb3e3440d2054f364517" style="width:100%;height:auto;" class="datatables html-widget"></div>
  108. <script type="application/json" data-for="htmlwidget-bb3e3440d2054f364517">{"x":{"filter":"none","data":[["AIX","Android","AmigaOS classic","AmigaOS 4","ArcaOS","Chrome OS","DragonFly BSD","FreeBSD","Genode","GhostBSD","eComStation","EPOC32","GNU/Linux","Haiku","HP-UX","IBM i","Inferno","iOS","IRIX","Classic Mac OS","MVS","macOS","macOS Server","MPE","MCP","MPE/XL","MINIX 3","NetBSD","NetWare","NeXTSTEP","OpenBSD","OpenIndiana","OpenVMS","OS/360","OS/390","OS 2200","OS/2","Plan 9","QNX","Solaris","Symbian","Symbian platform","Windows Server (NT family)","Windows (NT family)","Windows (classic 9x family)","RISC iX","RISC OS","RISC OS 4","RISC OS 5","RISC OS 6","ZETA","STOP 6, XTS-400","ReactOS","TrueOS","VxWorks","z/OS","z/VSE","z/VM","HP NonStop","Name"],["IBM","Android, Inc., Google","Commodore International, Haage &amp; Partner, Hyperion Entertainment","Hyperion Entertainment","Arca Noae, LLC","Google","Matthew Dillon","The FreeBSD Project","Genode Labs","Eric Turgeon","Serenity Systems, Mensys BV","Psion PLC","Notable contributors include: Richard Stallman for GNU Project and Linus Torvalds for Linux and the Unixes they emulated; Red Hat, Debian Project See: Comparison of Linux distributions and Linux Kernel#Development","Haiku Inc.","HP (now Hewlett Packard Enterprise)","IBM","Bell Labs","Apple Inc.","SGI","Apple Inc.","IBM","Apple Inc.","Apple Inc.","HP","Unisys","HP","Andrew S. 
Tanenbaum","The NetBSD Project","Novell","NeXT","The OpenBSD Project","Many, based on software developed by Sun Microsystems and many others","DEC (now VSI)","IBM","IBM","Unisys","IBM and Microsoft","Bell Labs","QNX Software Systems","Sun (now Oracle Corporation)","Symbian Ltd.","Symbian Foundation","Microsoft","Microsoft","Microsoft","Acorn Computers","Acorn Computers","RISCOS Ltd, Pace plc","Castle Technology, RISC OS Open","RISCOS Ltd","yellowTAB","BAE Systems","ReactOS development team","PC-BSD Software","Wind River Systems","IBM","IBM","IBM","HP (now Hewlett Packard Enterprise)","Creator"],["1986","2008","1985","2004","2017","2009","2003","1993","2008","2009","2001","1996","1991 (kernel), See: Comparison of Linux distributions and History of Linux","2009","1983","1988","1997","2007","1988","1984","1972","2001","2001","1974","1961","1987","2005","1993","1985","1989","1995","2010","1977","1966","1995","1967 as Exec 8e","1987","1993","1982","1992","1998","2010 (initially 1998 as Symbian)","1993","1993","1995","1988","1987","1999","2002","2006","2005","2003","1996","2006","1987","2000","2007","2000","1974","Initial public release"],["UNIX System V Release 3","None","TRIPOS (as the disk operating component of AmigaOS)","AmigaOS classic","OS/2","Chromium OS","FreeBSD","386BSD","None","FreeBSD","OS/2","","None","BeOS R5","UNIX System V","OS/400","Plan 9","macOS","UNIX System V","None[g 2][g 3]","OS/360","NeXTSTEP, BSD","NeXTSTEP, BSD","None","None","MPE","Minix2","386BSD","S-Net","Unix","NetBSD 1.0","OpenSolaris","RSX-11M","None","MVS","Exec 8, OS 1100","MS-DOS","Unix","Unix, POSIX","SunOS","EPOC32","Symbian","OS/2, Windows 3.x and MS-DOS","OS/2, Windows 9x and MS-DOS","MS-DOS, Windows NT 3.5","BSD 4.3","Arthur, also the BBC Master OS","RISC OS","RISC OS 4","RISC OS 4","BeOS R5","STOP 5, XTS-300","Windows NT","FreeBSD[g 5]","VRTX","OS/390","VSE/ESA","VM","Guardian","Predecessor"],["7.2 TL4","11","3.9 BB2","4.1 Final 
Edition","5.0.6","86.0.4240.77","5.8.3","12.1","19.05[1]","19.10[2]","2.1","ER5","5.8.9 (kernel)","R1/Beta 2","11i v3 2020 Release","7.4","Fourth Edition","14.0.1","6.5.30","9.2.2","MVS/ESA SP - JES3 Version 5 R2.2","10.15.1 Catalina","10.12 / September 20, 2016","MPE-V","CP OS 19.0","7.5","3.3.0","9.1","6.5 SP8","3.3","6.8","2020.04","9.0","Operating System/360 R21.8","OS/390 Version 2 R10","CP OS 18 (Exec 49.2)","4.52","Fourth Edition","7.0.0","11.4","9.5","3.0.4","Windows Server 2019 (Version 10.0.17763)","Windows 10 (Version 2004)","Windows Me (Win 4.90.3000)","1.21c","3.71","4.39","5.24[5]","6.20","1.5","6.5","0.4.13","10.1[6]","7","Version 2.4 (V2R4)","6.2","7.1","H06.24/J06.13","Current stable version"],["2019, November","2020, September 8","2002-03-20","2014","2020, August 31","2020, October 13","2020, September 24","2019, November 4","2019, May 29","2019, October 26","2011, May 20","1999","2020, September 12 (kernel)","2020, June 9","May 2020","2019, April 23","2009, June 30","2020, September 24","2006","2000","1995, September 29","2019, October 29","2016, September 20","1988","2019, June","2002","2014","2020, October 18","2009, May 6","1995","2020, October 18","2020, May 5","2020, May 15","1972, August","2000, September 29","2018, July 18","2001","2003 (except for minor later updates)","2017, March","2018, August 28","2009","2010","2018, October 2","2020, May 27","2000","1993","1997","2004","2018","2009","2007","2008, August","2020, April 9","2014, November 16","2014 March","2019, September 30","2017, December 1","2018, September 21 [7]","2012","Release date"],["Bundled with hardware","Free","Discontinued; Bundled with hardware up to version 3.0 (Amiga International hardware came with 3.1); versions 2.1, 3.0, 3.1, 3.5, 3.9 also available as separate packages","4.0 bundled with hardware; 4.0 for classic and 4.1 available as standalone package at €29","Personal edition US$129.00Commercial edition US$229.00","Bundled with hardware, 32-bit edition 
dropped","Free","Free","Free, source code only","Free","Home-student edition (max. three per site) $145.00business edition $290.00","Discontinued; Commercial","Free","Free","US$400","Bundled with hardware","Free","Bundled with hardware and free updates given to most existing users, subject to hardware requirements","Discontinued; Bundled with hardware","Discontinued; Was bundled with 68k and PowerPC Macs;\nversions 7-9 sold as retail upgrades[g 4]","Bundled with hardware","Bundled with hardware; No-cost update via Mac App Store for users of Mac OS X 10.6 or later, assuming hardware requirements are met","Previously bundled with hardware; No longer a separate operating system, but a group of services installed atop any current version of Mac OS X; US$19.99 on the Mac App Store","Discontinued; Was bundled with HP-3000 CISC hardware \"Classic\"","Bundled with hardware","Discontinued; Was bundled with HP-3000 PA-RISC hardware","Free","Free","Superseded by Novell Open Enterprise Server; Was US$184 (equivalent to $219.28 in 2019) (one-user)","Discontinued; Was bundled with hardware, then sold separately","Free","Free","Commercial, free non-commercial use","Bundled with hardware","Bundled with hardware","Bundled with hardware","Discontinued (see ArcaOS successor); Was US$300 (equivalent to $433.17 in 2019)","Free","Bundled with BlackBerry 10 and PlayBook devices. 
Commercial; an academic version exists that needs authorization code before installing","Commercial; (but free/no-cost perpetual license when used \"for the purpose of developing, testing, prototyping and demonstrating your applications\"[3])","Discontinued; Commercial","Free","US$1050 5 CALs server; other editions dependent on number of CALs purchased","Windows 10 Home US$119, Windows 10 Pro US$199[4]","Discontinued","Discontinued; Was bundled with hardware","Discontinued; Was bundled with hardware","Bundled with hardware, then sold separately at £70 (US$127)","Free","Bundled with hardware, then sold separately at £70 (US$127)","Discontinued","US$60,000 (equivalent to $71,249 in 2019)+; bundled with XTS hardware and OEM licensed","Free","Free","Paid","Monthly license fee, about US$130 and up","Monthly license fee","Monthly license fee","Non-free","Cost, availability"],["Proprietary","Apache 2.0, GNU GPLv2","Proprietary, open source clone available under AROS Public License","Proprietary","Proprietary","Proprietary: Google OS Terms of Service","BSD","BSD","AGPL","BSD","Proprietary","Proprietary","GNU GPLv2 (kernel)","MIT","Proprietary","Proprietary","MIT, GNU GPL, GNU LGPL, LPL","Proprietary higher level API layers; open source core system (ARM versions): APSL, GNU GPL, others","Proprietary","Proprietary","Proprietary","Proprietary higher level API layers; open source core system (Intel-PowerPC versions): APSL, GNU GPL, others","Proprietary higher level API layers; open source core system (Intel-PowerPC versions): APSL, GNU GPL, others","Proprietary","Proprietary","Proprietary","BSD","BSD","Proprietary","Proprietary","ISC","CDDL","Proprietary","Proprietary","Proprietary","Proprietary","Proprietary","LPL","Proprietary","CDDL","Proprietary","EPL","Proprietary; Source-available","Proprietary; Source-available","Proprietary","Proprietary","Proprietary","Proprietary","Apache License","Proprietary","Proprietary","Proprietary","GNU GPL, GNU 
LGPL","BSD","Proprietary","Proprietary","Proprietary","Proprietary","Proprietary","Preferred license[g 1]"],["Server, NAS, workstation","Smartphone, tablet computer, education","Workstation, personal computer","Workstation, personal computer","Server, workstation, personal computer","Chromebook, Chromebox, Chromebase and tablets","Server, workstation, NAS, embedded","Server, workstation, NAS, embedded","Desktop, Embedded, Server","Desktop, workstation","Server, workstation, personal computer","PDA","See: Comparison of Linux distributions","Personal computer","Server","Server","NAS, server, embedded","Smartphone, music player, tablet computer","Server, workstation","Workstation, personal computer","IBM mainframe","Workstation, personal computer, embedded","Server","Server","Server","Server","Workstation","NAS, server, workstation, embedded","Server","Workstation","Server, NAS, workstation, embedded","Server, workstation","Server, workstation","IBM mainframe","IBM mainframe","Server","Personal computer, server","Workstation, server, embedded, HPC","Automotive, medical, smartphones, consumer, industrial, embedded, safety","Server, workstation","Phones","embedded","Server, NAS, embedded","Workstation, personal computer, media center, Tablet PC, embedded","Personal computer, media center","Workstation","Education, personal computer","Education, personal computer","Education, personal computer","Education, personal computer","Personal computer, media center, workstation","Server, workstation","Workstation, personal computer","Personal computer, workstation, server","Embedded Real-time systems","IBM mainframe","IBM mainframe","IBM mainframe","HP Nonstop Servers","Target system type"]],"container":"<table class=\"display\">\n <thead>\n <tr>\n <th>Name<\/th>\n <th>Creator<\/th>\n <th>Initial public release<\/th>\n <th>Predecessor<\/th>\n <th>Current stable version<\/th>\n <th>Release date<\/th>\n <th>Cost, availability<\/th>\n <th>Preferred license[g 1]<\/th>\n <th>Target 
system type<\/th>\n <\/tr>\n <\/thead>\n<\/table>","options":{"paging":false,"info":false,"searching":false,"order":[],"autoWidth":false,"orderClasses":false}},"evals":[],"jsHooks":[]}</script>
  109. ---
  110. # Exercices
  111. ### Transformer cette table en forme normale
  112. ### Extraire la table avec les informations techniques
  113. ### Identifier les OS libres fonctionnant avec un microkernel
  114. </textarea>
  115. <style data-target="print-only">@media screen {.remark-slide-container{display:block;}.remark-slide-scaler{box-shadow:none;}}</style>
  116. <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
  117. <script src="addons/macros.js"></script>
  118. <script>var slideshow = remark.create({
  119. "ratio": "4:3",
  120. "countIncrementalSlides": false,
  121. "self-contained": true,
  122. "highlightLines": true
  123. });
  124. if (window.HTMLWidgets) slideshow.on('afterShowSlide', function (slide) {
  125. window.dispatchEvent(new Event('resize'));
  126. });
  // Append an @page rule whose size is the inline width and height of the first
  // .remark-slide-scaler element. Does nothing when no such element exists.
  (function (doc) {
    var scaler = doc.querySelector(".remark-slide-scaler");
    if (scaler === null) {
      return;
    }
    var pageSize = scaler.style.width + " " + scaler.style.height;
    var pageStyle = doc.createElement("style");
    pageStyle.type = "text/css";
    pageStyle.innerHTML = "@page {size: " + pageSize + "; }";
    doc.head.appendChild(pageStyle);
  })(document);
  // Add the "has-continuation" class to children of the slides area that are
  // followed by a continued (or uncounted) slide, then install a print rule
  // that hides every element carrying that class.
  (function (d) {
    var areas = d.getElementsByClassName("remark-slides-area");
    // getElementsByClassName always returns an HTMLCollection, which is truthy
    // even when empty. Test its length instead, otherwise areas[0].children
    // throws when the page has no slides area.
    if (!areas.length) return;
    var slide, slides = slideshow.getSlides(), els = areas[0].children;
    // Start at 1: the element that gets marked is the one at index i - 1.
    for (var i = 1; i < slides.length; i++) {
      slide = slides[i];
      // The properties are compared as strings ("true" / "false").
      if (slide.properties.continued === "true" || slide.properties.count === "false") {
        els[i - 1].className += ' has-continuation';
      }
    }
    var s = d.createElement("style");
    s.type = "text/css";
    s.innerHTML = "@media print { .has-continuation { display: none; } }";
    d.head.appendChild(s);
  })(document);
  // Remove the temporary CSS (the one that displays all slides initially) the
  // first time a slide is about to be shown; later events are ignored.
  (function () {
    var alreadyRemoved = false;
    slideshow.on('beforeShowSlide', function (slide) {
      if (alreadyRemoved) {
        return;
      }
      var styleSheets = document.styleSheets;
      for (var idx = 0; idx < styleSheets.length; idx++) {
        var owner = styleSheets[idx].ownerNode;
        // Only the element tagged data-target="print-only" is detached.
        if (owner.dataset["target"] === "print-only") {
          owner.parentNode.removeChild(owner);
        }
      }
      alreadyRemoved = true;
    });
  })();
  // Replace every <script> found inside a slide container with a newly created
  // <script> element that has the same text and attributes, to make it
  // executable.
  (function () {
    "use strict";
    var slideScripts = document.querySelectorAll(
      '.remark-slides-area .remark-slide-container script'
    );
    if (slideScripts.length === 0) {
      return;
    }
    Array.prototype.forEach.call(slideScripts, function (original) {
      var replacement = document.createElement('script');
      replacement.appendChild(document.createTextNode(original.textContent));
      // Copy attributes one by one (src, type, data-for, ...).
      Array.prototype.forEach.call(original.attributes, function (attr) {
        replacement.setAttribute(attr.name, attr.value);
      });
      original.parentElement.replaceChild(replacement, original);
    });
  })();
  // Make every link whose href starts with http://, https:// or // open in a
  // new tab, and add rel="noopener" to those links.
  (function () {
    var links = document.getElementsByTagName('a');
    for (var i = 0; i < links.length; i++) {
      if (/^(https?:)?\/\//.test(links[i].getAttribute('href'))) {
        links[i].target = '_blank';
        // Prevent the opened page from reaching this one via window.opener.
        // relList.add keeps any rel tokens already present; the guard skips
        // browsers that do not expose relList on anchors.
        if (links[i].relList) {
          links[i].relList.add('noopener');
        }
      }
    }
  })();
  188. // adds .remark-code-has-line-highlighted class to <pre> parent elements
  189. // of code chunks containing highlighted lines with class .remark-code-line-highlighted
  190. (function(d) {
  191. const hlines = d.querySelectorAll('.remark-code-line-highlighted');
  192. const preParents = [];
  193. const findPreParent = function(line, p = 0) {
  194. if (p > 1) return null; // traverse up no further than grandparent
  195. const el = line.parentElement;
  196. return el.tagName === "PRE" ? el : findPreParent(el, ++p);
  197. };
  198. for (let line of hlines) {
  199. let pre = findPreParent(line);
  200. if (pre && !preParents.includes(pre)) preParents.push(pre);
  201. }
  202. preParents.forEach(p => p.classList.add("remark-code-has-line-highlighted"));
  203. })(document);</script>
  204. <script>
  // Unwrap <code> elements whose whole text is a math expression, i.e. replace
  // the element by its inner HTML. A <code> qualifies when it is not a direct
  // child of <pre>, has no child elements, and its text matches one of the
  // delimiter patterns below: \( \), \[ \], $$ $$ or \begin{..} \end{..}.
  // (Presumably this lets MathJax see the expression -- confirm.)
  slideshow._releaseMath = function (el) {
    var mathPatterns = [
      /^\\\((.|\s)+\\\)$/,
      /^\\\[(.|\s)+\\\]$/,
      /^\$\$(.|\s)+\$\$$/,
      /^\\begin\{([^}]+)\}(.|\s)+\\end\{[^}]+\}$/
    ];
    var codes = el.getElementsByTagName('code');
    var idx = 0;
    while (idx < codes.length) {
      var code = codes[idx];
      var isCandidate = code.parentNode.tagName !== 'PRE' && code.childElementCount === 0;
      if (isCandidate) {
        var text = code.textContent;
        var looksLikeMath = mathPatterns.some(function (pattern) {
          return pattern.test(text);
        });
        if (looksLikeMath) {
          code.outerHTML = code.innerHTML; // remove <code></code>
          // `codes` is a live collection: the unwrapped element has dropped out
          // of it, so the same index now refers to the next element.
          continue;
        }
      }
      idx++;
    }
  };
  slideshow._releaseMath(document);
  222. </script>
  223. <!-- dynamically load mathjax for compatibility with self-contained -->
  224. <script>
  // Append a <script> element for MathJax to <head>. Unless the page is opened
  // with the file: protocol, the leading "http:"/"https:" is stripped from the
  // URL so that it becomes protocol-relative.
  (function () {
    var mathjax = document.createElement('script');
    mathjax.type = 'text/javascript';
    mathjax.src = 'https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML';
    var openedFromFile = location.protocol === 'file:';
    if (!openedFromFile && /^https?:/.test(mathjax.src)) {
      mathjax.src = mathjax.src.replace(/^https?:/, '');
    }
    document.getElementsByTagName('head')[0].appendChild(mathjax);
  })();
  233. </script>
  234. </body>
  235. </html>