package ezvcard.io.html;

import static ezvcard.util.HtmlUtils.isChildOf;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import ezvcard.VCard;
import ezvcard.VCardVersion;
import ezvcard.io.CannotParseException;
import ezvcard.io.EmbeddedVCardException;
import ezvcard.io.ParseWarning;
import ezvcard.io.SkipMeException;
import ezvcard.io.StreamReader;
import ezvcard.io.scribe.RawPropertyScribe;
import ezvcard.io.scribe.VCardPropertyScribe;
import ezvcard.property.Categories;
import ezvcard.property.Email;
import ezvcard.property.Impp;
import ezvcard.property.Label;
import ezvcard.property.Nickname;
import ezvcard.property.RawProperty;
import ezvcard.property.Telephone;
import ezvcard.property.Url;
import ezvcard.property.VCardProperty;
import ezvcard.util.Gobble;
import ezvcard.util.IOUtils;

/*
 Copyright (c) 2012-2018, Michael Angstadt
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met: 

 1. Redistributions of source code must retain the above copyright notice, this
 list of conditions and the following disclaimer. 
 2. Redistributions in binary form must reproduce the above copyright notice,
 this list of conditions and the following disclaimer in the documentation
 and/or other materials provided with the distribution. 

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 The views and conclusions contained in the software and documentation are those
 of the authors and should not be interpreted as representing official policies, 
 either expressed or implied, of the FreeBSD Project.
 */

/**
 * <p>
 * Parses {@link VCard} objects from an HTML page (hCard format).
 * </p>
 * <p>
 * <b>Example:</b>
 * </p>
 * 
 * <pre class="brush:java">
 * URL url = new URL("http://example.com");
 * HCardParser parser = new HCardParser(url);
 * List&lt;VCard&gt; vcards = parser.parseAll();
 * </pre>
 * @author Michael Angstadt
 * @see <a href="http://microformats.org/wiki/hcard">http://microformats.org/
 * wiki/hcard</a>
 */
public class HCardParser extends StreamReader {
	private final String pageUrl;
	private final Elements vcardElements;
	private final Iterator<Element> vcardElementsIt;
	private final List<Label> labels = new ArrayList<Label>();

	private VCard vcard;
	private final Elements embeddedVCards = new Elements();
	private Nickname nickname;
	private Categories categories;

	/*
	 * Lower-cased property names that visit() compares CSS class names
	 * against. Locale.ROOT is used so that the result does not depend on the
	 * JVM's default locale (e.g. under a Turkish locale, "I" would otherwise
	 * be lower-cased to a dotless "i" and the comparisons would never match).
	 */
	private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase(Locale.ROOT);
	private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase(Locale.ROOT);
	private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase(Locale.ROOT);
	private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase(Locale.ROOT);

	/**
	 * Creates an hCard parser that downloads and parses a webpage.
	 * @param url the URL of the webpage
	 * @throws IOException if there's a problem loading the webpage
	 */
	public HCardParser(URL url) throws IOException {
		this(Jsoup.parse(url, 30000), url.toString());
	}

	/**
	 * Creates an hCard parser.
	 * @param in the input stream to the HTML page
	 * @throws IOException if there's a problem reading the HTML page
	 */
	public HCardParser(InputStream in) throws IOException {
		this(in, null);
	}

	/**
	 * Creates an hCard parser.
	 * @param in the input stream to the HTML page
	 * @param pageUrl the original URL of the HTML page (used to resolve
	 * relative links) or null if not known
	 * @throws IOException if there's a problem reading the HTML page
	 */
	public HCardParser(InputStream in, String pageUrl) throws IOException {
		this((pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl), pageUrl);
	}

	/**
	 * Creates an hCard parser.
	 * @param file the HTML file
	 * @throws IOException if there's a problem reading the HTML file
	 */
	public HCardParser(File file) throws IOException {
		this(file, null);
	}

	/**
	 * Creates an hCard parser.
	 * @param file the HTML file
	 * @param pageUrl the original URL of the HTML page (used to resolve
	 * relative links) or null if not known
	 * @throws IOException if there's a problem reading the HTML file
	 */
	public HCardParser(File file, String pageUrl) throws IOException {
		this((pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl), pageUrl);
	}

	/**
	 * Creates an hCard parser.
	 * @param reader the reader to the HTML page
	 * @throws IOException if there's a problem reading the HTML page
	 */
	public HCardParser(Reader reader) throws IOException {
		this(reader, null);
	}

	/**
	 * Creates an hCard parser. The entire content of the reader is consumed
	 * before parsing begins.
	 * @param reader the reader to the HTML page
	 * @param pageUrl the original URL of the HTML page (used to resolve
	 * relative links) or null if not known
	 * @throws IOException if there's a problem reading the HTML page
	 */
	public HCardParser(Reader reader, String pageUrl) throws IOException {
		this(new Gobble(reader).asString(), pageUrl);
	}

	/**
	 * Creates an hCard parser.
	 * @param html the HTML page
	 */
	public HCardParser(String html) {
		this(html, null);
	}

	/**
	 * Creates an hCard parser.
	 * @param html the HTML page
	 * @param pageUrl the original URL of the HTML page (used to resolve
	 * relative links) or null if not known
	 */
	public HCardParser(String html, String pageUrl) {
		this((pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl), pageUrl);
	}

	/**
	 * Creates an hCard parser.
	 * @param document the HTML page
	 */
	public HCardParser(Document document) {
		this(document, null);
	}

	/**
	 * Creates an hCard parser. If the page URL contains a fragment (the part
	 * after "#") and the document contains an element whose ID equals that
	 * fragment, then only that element is searched for hCards. Otherwise, the
	 * whole document is searched.
	 * @param document the HTML page
	 * @param pageUrl the original URL of the HTML page (used to resolve
	 * relative links) or null if not known
	 */
	public HCardParser(Document document, String pageUrl) {
		this.pageUrl = pageUrl;

		String anchor = null;
		if (pageUrl != null) {
			try {
				URL url = new URL(pageUrl);
				anchor = url.getRef();
			} catch (MalformedURLException e) {
				//not a parseable URL, so there is no anchor to narrow the search with
				anchor = null;
			}
		}

		/*
		 * A URL that ends with a bare "#" has an empty (not null) fragment.
		 * An empty ID must not be passed to getElementById(), which rejects
		 * empty strings, so it is treated the same as "no anchor".
		 */
		Element searchUnder = null;
		if (anchor != null && anchor.length() > 0) {
			searchUnder = document.getElementById(anchor);
		}
		if (searchUnder == null) {
			searchUnder = document;
		}

		vcardElements = searchUnder.getElementsByClass("vcard");

		//remove nested vcard elements
		Iterator<Element> it = vcardElements.iterator();
		while (it.hasNext()) {
			Element element = it.next();
			if (isChildOf(element, vcardElements)) {
				it.remove();
			}
		}

		vcardElementsIt = vcardElements.iterator();
	}

	/**
	 * Constructor for reading embedded vCards.
	 * @param embeddedVCard the HTML element of the embedded vCard
	 * @param pageUrl the original URL of the HTML page
	 */
	private HCardParser(Element embeddedVCard, String pageUrl) {
		this.pageUrl = pageUrl;
		vcardElements = new Elements(embeddedVCard);
		vcardElementsIt = vcardElements.iterator();
	}

	/**
	 * Reads the next vCard from the HTML page. This override removes the
	 * checked {@link IOException} from the method signature.
	 * @return the value returned by the parent implementation
	 */
	@Override
	public VCard readNext() {
		try {
			return super.readNext();
		} catch (IOException e) {
			//will not be thrown because reading from DOM
			throw new RuntimeException(e);
		}
	}

	/**
	 * Parses the next "vcard" element that was found on the page.
	 * @return the parsed vCard or null if there are no more "vcard" elements
	 */
	@Override
	protected VCard _readNext() {
		if (!vcardElementsIt.hasNext()) {
			return null;
		}

		context.setVersion(VCardVersion.V3_0);
		parseVCardElement(vcardElementsIt.next());
		return vcard;
	}

	/**
	 * Builds a new {@link VCard} object out of a "vcard" element and stores
	 * it in the {@link #vcard} field.
	 * @param vcardElement the HTML element that has the "vcard" class
	 */
	private void parseVCardElement(Element vcardElement) {
		//reset the per-vCard state
		labels.clear();
		nickname = null;
		categories = null;

		vcard = new VCard();
		vcard.setVersion(VCardVersion.V3_0);
		if (pageUrl != null) {
			vcard.addSource(pageUrl);
		}

		//visit all descendant nodes, depth-first
		for (Element child : vcardElement.children()) {
			visit(child);
		}

		//assign labels to their addresses
		assignLabels(vcard, labels);
	}

	/**
	 * Inspects each CSS class name of an element, parses the element into a
	 * property for every class name that maps to a property scribe (or that
	 * starts with "x-"), and then recursively visits the element's children
	 * (unless the element turned out to be an embedded vCard).
	 * @param element the element to visit
	 */
	private void visit(Element element) {
		boolean visitChildren = true;
		Set<String> classNames = element.classNames();
		for (String className : classNames) {
			//locale-independent so that the comparisons below behave the same on every JVM
			className = className.toLowerCase(Locale.ROOT);

			//give special treatment to certain URLs
			if (urlPropertyName.equals(className)) {
				String href = element.attr("href");
				if (href.length() > 0) {
					if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) {
						className = emailName;
					} else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) {
						className = telName;
					} else if (parseAsImpp(element)) {
						//the URL was an instant messenger URL and has been added to the vCard
						continue;
					}
				}
			}

			//hCard uses a different name for the CATEGORIES property
			if ("category".equals(className)) {
				className = categoriesName;
			}

			VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className);
			if (scribe == null) {
				//if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard
				if (!className.startsWith("x-")) {
					continue;
				}
				scribe = new RawPropertyScribe(className);
			}

			context.getWarnings().clear();
			context.setPropertyName(scribe.getPropertyName());

			VCardProperty property;
			try {
				property = scribe.parseHtml(new HCardElement(element), context);
				warnings.addAll(context.getWarnings());

				//LABELs must be treated specially so they can be matched up with their ADRs
				if (property instanceof Label) {
					labels.add((Label) property);
					continue;
				}

				//add all NICKNAMEs to the same type object
				if (property instanceof Nickname) {
					Nickname nn = (Nickname) property;
					if (nickname == null) {
						nickname = nn;
						vcard.addProperty(nickname);
					} else {
						nickname.getValues().addAll(nn.getValues());
					}
					continue;
				}

				//add all CATEGORIES to the same type object
				if (property instanceof Categories) {
					Categories c = (Categories) property;
					if (categories == null) {
						categories = c;
						vcard.addProperty(categories);
					} else {
						categories.getValues().addAll(c.getValues());
					}
					continue;
				}
			} catch (SkipMeException e) {
				//@formatter:off
				warnings.add(new ParseWarning.Builder(context)
					.message(22, e.getMessage())
					.build()
				);
				//@formatter:on
				continue;
			} catch (CannotParseException e) {
				//@formatter:off
				warnings.add(new ParseWarning.Builder(context)
					.message(e)
					.build()
				);
				//@formatter:on

				//keep the unparseable element's HTML as the value of a raw property
				property = new RawProperty(className, element.outerHtml());
			} catch (EmbeddedVCardException e) {
				if (isChildOf(element, embeddedVCards)) {
					//prevents multiple-nested embedded elements from overwriting each other
					continue;
				}

				property = e.getProperty();

				embeddedVCards.add(element);
				HCardParser embeddedReader = new HCardParser(element, pageUrl);
				try {
					VCard embeddedVCard = embeddedReader.readNext();
					e.injectVCard(embeddedVCard);
				} finally {
					warnings.addAll(embeddedReader.getWarnings());
					IOUtils.closeQuietly(embeddedReader);
				}

				//the embedded reader has already processed this element
				visitChildren = false;
			}

			vcard.addProperty(property);
		}

		if (visitChildren) {
			for (Element child : element.children()) {
				visit(child);
			}
		}
	}

	/**
	 * Attempts to parse an element as an {@link Impp} property. If this is
	 * successful, the property is added to the vCard that is currently being
	 * built and any warnings that were generated are recorded.
	 * @param element the element to parse
	 * @return true if the element was parsed and added to the vCard, false if
	 * the scribe rejected it (in which case nothing is added or recorded)
	 */
	private boolean parseAsImpp(Element element) {
		VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class);

		context.getWarnings().clear();
		context.setPropertyName(scribe.getPropertyName());
		try {
			VCardProperty property = scribe.parseHtml(new HCardElement(element), context);
			vcard.addProperty(property);
			warnings.addAll(context.getWarnings());
			return true;
		} catch (SkipMeException e) {
			//URL is not an instant messenger URL
			return false;
		} catch (CannotParseException e) {
			//URL is not an instant messenger URL
			return false;
		}
	}

	/**
	 * Does nothing.
	 */
	public void close() {
		//empty
	}
}