001package ezvcard.io.html; 002 003import static ezvcard.util.HtmlUtils.isChildOf; 004 005import java.io.IOException; 006import java.io.InputStream; 007import java.io.Reader; 008import java.io.UncheckedIOException; 009import java.net.URL; 010import java.nio.file.Path; 011import java.time.Duration; 012import java.util.ArrayList; 013import java.util.Iterator; 014import java.util.List; 015import java.util.Set; 016import java.util.stream.Collectors; 017 018import org.jsoup.Jsoup; 019import org.jsoup.nodes.Document; 020import org.jsoup.nodes.Element; 021import org.jsoup.select.Elements; 022 023import ezvcard.VCard; 024import ezvcard.VCardVersion; 025import ezvcard.io.CannotParseException; 026import ezvcard.io.EmbeddedVCardException; 027import ezvcard.io.ParseWarning; 028import ezvcard.io.SkipMeException; 029import ezvcard.io.StreamReader; 030import ezvcard.io.scribe.RawPropertyScribe; 031import ezvcard.io.scribe.VCardPropertyScribe; 032import ezvcard.property.Categories; 033import ezvcard.property.Email; 034import ezvcard.property.Impp; 035import ezvcard.property.Label; 036import ezvcard.property.Nickname; 037import ezvcard.property.RawProperty; 038import ezvcard.property.Telephone; 039import ezvcard.property.Url; 040import ezvcard.property.VCardProperty; 041import ezvcard.util.Gobble; 042import ezvcard.util.HtmlUtils; 043import ezvcard.util.IOUtils; 044 045/* 046 Copyright (c) 2012-2026, Michael Angstadt 047 All rights reserved. 048 049 Redistribution and use in source and binary forms, with or without 050 modification, are permitted provided that the following conditions are met: 051 052 1. Redistributions of source code must retain the above copyright notice, this 053 list of conditions and the following disclaimer. 054 2. Redistributions in binary form must reproduce the above copyright notice, 055 this list of conditions and the following disclaimer in the documentation 056 and/or other materials provided with the distribution. 057 058 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 059 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 060 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 061 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 062 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 063 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 064 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 065 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 066 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 067 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 068 069 The views and conclusions contained in the software and documentation are those 070 of the authors and should not be interpreted as representing official policies, 071 either expressed or implied, of the FreeBSD Project. 072 */ 073 074/** 075 * <p> 076 * Parses {@link VCard} objects from an HTML page (hCard format). 077 * </p> 078 * <p> 079 * <b>Example:</b> 080 * </p> 081 * 082 * <pre class="brush:java"> 083 * URL url = new URL("http://example.com"); 084 * HCardParser parser = new HCardParser(url); 085 * List<VCard> vcards = parser.parseAll(); 086 * </pre> 087 * @author Michael Angstadt 088 * @see <a href="http://microformats.org/wiki/hcard">http://microformats.org/ 089 * wiki/hcard</a> 090 */ 091public class HCardParser extends StreamReader { 092 private static final Duration urlTimeout = Duration.ofSeconds(30); 093 094 private final String pageUrl; 095 private final Iterator<Element> vcardElementsIt; 096 private final List<Label> labels = new ArrayList<>(); 097 098 private VCard vcard; 099 private Elements embeddedVCards = new Elements(); 100 private Nickname nickname; 101 private Categories categories; 102 103 private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase(); 104 private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase(); 105 private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase(); 106 private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase(); 107 108 /** 109 * Creates an hCard document. 110 * @param url the URL of the webpage 111 * @throws IOException if there's a problem loading the webpage 112 */ 113 public HCardParser(URL url) throws IOException { 114 this(Jsoup.parse(url, (int) urlTimeout.toMillis()), url.toString()); 115 } 116 117 /** 118 * Creates an hCard document. 119 * @param in the input stream to the HTML page 120 * @throws IOException if there's a problem reading the HTML page 121 */ 122 public HCardParser(InputStream in) throws IOException { 123 this(in, null); 124 } 125 126 /** 127 * Creates an hCard document. 128 * @param in the input stream to the HTML page 129 * @param pageUrl the original URL of the HTML page (used to resolve 130 * relative links) 131 * @throws IOException if there's a problem reading the HTML page 132 */ 133 public HCardParser(InputStream in, String pageUrl) throws IOException { 134 this((pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl), pageUrl); 135 } 136 137 /** 138 * Creates an hCard document. 139 * @param file the HTML file 140 * @throws IOException if there's a problem reading the HTML file 141 */ 142 public HCardParser(Path file) throws IOException { 143 this(file, null); 144 } 145 146 /** 147 * Creates an hCard document. 148 * @param file the HTML file 149 * @param pageUrl the original URL of the HTML page (used to resolve 150 * relative links) 151 * @throws IOException if there's a problem reading the HTML file 152 */ 153 public HCardParser(Path file, String pageUrl) throws IOException { 154 this((pageUrl == null) ? Jsoup.parse(file.toFile(), null, "") : Jsoup.parse(file.toFile(), null, pageUrl), pageUrl); 155 } 156 157 /** 158 * Creates an hCard document. 159 * @param reader the input stream to the HTML page 160 * @throws IOException if there's a problem reading the HTML page 161 */ 162 public HCardParser(Reader reader) throws IOException { 163 this(reader, null); 164 } 165 166 /** 167 * Creates an hCard document. 168 * @param reader the input stream to the HTML page 169 * @param pageUrl the original URL of the HTML page (used to resolve 170 * relative links) 171 * @throws IOException if there's a problem reading the HTML page 172 */ 173 public HCardParser(Reader reader, String pageUrl) throws IOException { 174 this(new Gobble(reader).asString(), pageUrl); 175 } 176 177 /** 178 * Creates an hCard document. 179 * @param html the HTML page 180 */ 181 public HCardParser(String html) { 182 this(html, null); 183 } 184 185 /** 186 * Creates an hCard document. 187 * @param html the HTML page 188 * @param pageUrl the original URL of the HTML page (used to resolve 189 * relative links) 190 */ 191 public HCardParser(String html, String pageUrl) { 192 this((pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl), pageUrl); 193 } 194 195 /** 196 * Creates an hCard document. 197 * @param document the HTML page 198 */ 199 public HCardParser(Document document) { 200 this(document, null); 201 } 202 203 /** 204 * Creates an hCard document. 205 * @param document the HTML page 206 * @param pageUrl the original URL of the HTML page (used to resolve 207 * relative links) 208 */ 209 public HCardParser(Document document, String pageUrl) { 210 this.pageUrl = pageUrl; 211 212 String anchor = (pageUrl == null) ? null : HtmlUtils.getAnchorFromUrl(pageUrl); 213 214 Element searchUnder = (anchor == null) ? null : document.getElementById(anchor); 215 if (searchUnder == null) { 216 searchUnder = document; 217 } 218 219 /* 220 * Nested vCards also show up in this list as separate list items. For 221 * example, if the HTML document has one vCard and that vCard has one 222 * nested vCard (i.e. AGENT property), this list will have two elements. 223 * 224 * Exclude the nested vCards from being processed as their own, 225 * independent vCards. 226 */ 227 Elements vcardElementsIncludingNested = searchUnder.getElementsByClass("vcard"); 228 229 //@formatter:off 230 List<Element> vcardElementsWithoutNested = vcardElementsIncludingNested.stream() 231 .filter(element -> !isChildOf(element, vcardElementsIncludingNested)) 232 .collect(Collectors.toList()); 233 //@formatter:on 234 235 vcardElementsIt = vcardElementsWithoutNested.iterator(); 236 } 237 238 /** 239 * Constructor for reading embedded vCards. 240 * @param embeddedVCard the HTML element of the embedded vCard 241 * @param pageUrl the original URL of the HTML page 242 */ 243 private HCardParser(Element embeddedVCard, String pageUrl) { 244 this.pageUrl = pageUrl; 245 vcardElementsIt = new Elements(embeddedVCard).iterator(); 246 } 247 248 @Override 249 public VCard readNext() { 250 try { 251 return super.readNext(); 252 } catch (IOException e) { 253 //will not be thrown because reading from DOM 254 throw new UncheckedIOException(e); 255 } 256 } 257 258 @Override 259 protected VCard _readNext() { 260 if (!vcardElementsIt.hasNext()) { 261 return null; 262 } 263 264 context.setVersion(VCardVersion.V3_0); 265 parseVCardElement(vcardElementsIt.next()); 266 return vcard; 267 } 268 269 private void parseVCardElement(Element vcardElement) { 270 labels.clear(); 271 nickname = null; 272 categories = null; 273 274 vcard = new VCard(); 275 vcard.setVersion(VCardVersion.V3_0); 276 if (pageUrl != null) { 277 vcard.addSource(pageUrl); 278 } 279 280 //visit all descendant nodes, depth-first 281 vcardElement.children().forEach(this::visit); 282 283 //assign labels to their addresses 284 assignLabels(vcard, labels); 285 } 286 287 private void visit(Element element) { 288 int embeddedVCardCount = embeddedVCards.size(); 289 290 Set<String> classNames = adjustClassNames(element); 291 292 classNames.forEach(className -> parseProperty(element, className)); 293 294 boolean noEmbeddedVCardsWereAdded = (embeddedVCardCount == embeddedVCards.size()); 295 if (noEmbeddedVCardsWereAdded) { 296 //do not visit children if there are any embedded vCards 297 element.children().forEach(this::visit); 298 } 299 } 300 301 private Set<String> adjustClassNames(Element element) { 302 //@formatter:off 303 Set<String> classNamesToLower = element.classNames().stream() 304 .map(String::toLowerCase) 305 .collect(Collectors.toSet()); 306 307 return classNamesToLower.stream() 308 .map(className -> adjustClassName(className, element, classNamesToLower)) 309 .collect(Collectors.toSet()); 310 //@formatter:on 311 } 312 313 private String adjustClassName(String className, Element element, Set<String> origClassNames) { 314 /* 315 * hCard uses a different name for the CATEGORIES property. 316 */ 317 if ("category".equals(className)) { 318 return categoriesName; 319 } 320 321 /* 322 * Give special treatment to certain URLs. 323 */ 324 if (urlPropertyName.equals(className)) { 325 String href = element.attr("href"); 326 if (!origClassNames.contains(emailName) && href.matches("(?i)mailto:.*")) { 327 return emailName; 328 } 329 if (!origClassNames.contains(telName) && href.matches("(?i)tel:.*")) { 330 return telName; 331 } 332 } 333 334 return className; 335 } 336 337 private VCardProperty tryToParseAsImpp(Element element) { 338 String href = element.attr("href"); 339 if (href.isEmpty()) { 340 return null; 341 } 342 343 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class); 344 345 context.getWarnings().clear(); 346 context.setPropertyName(scribe.getPropertyName()); 347 try { 348 VCardProperty property = scribe.parseHtml(new HCardElement(element), context); 349 warnings.addAll(context.getWarnings()); 350 return property; 351 } catch (SkipMeException | CannotParseException e) { 352 //URL is not an instant messenger URL 353 return null; 354 } 355 } 356 357 private VCardPropertyScribe<? extends VCardProperty> getPropertyScribe(String className) { 358 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className); 359 360 if (scribe == null) { 361 /* 362 * If no scribe is found, and the class name doesn't start with 363 * "x-", then it must be an arbitrary CSS class that has nothing to 364 * do with vCard 365 */ 366 if (!className.startsWith("x-")) { 367 return null; 368 } 369 370 scribe = new RawPropertyScribe(className); 371 } 372 373 return scribe; 374 } 375 376 private VCard parseEmbeddedVCard(Element element) { 377 embeddedVCards.add(element); 378 HCardParser embeddedReader = new HCardParser(element, pageUrl); 379 try { 380 return embeddedReader.readNext(); 381 } finally { 382 warnings.addAll(embeddedReader.getWarnings()); 383 IOUtils.closeQuietly(embeddedReader); 384 } 385 } 386 387 private void parseProperty(Element element, String className) { 388 if (urlPropertyName.equals(className)) { 389 VCardProperty impp = tryToParseAsImpp(element); 390 if (impp != null) { 391 vcard.addProperty(impp); 392 return; 393 } 394 } 395 396 VCardPropertyScribe<? extends VCardProperty> scribe = getPropertyScribe(className); 397 if (scribe == null) { 398 //it's a CSS class that's unrelated to hCard 399 return; 400 } 401 402 context.getWarnings().clear(); 403 context.setPropertyName(scribe.getPropertyName()); 404 405 VCardProperty property; 406 try { 407 property = scribe.parseHtml(new HCardElement(element), context); 408 } catch (SkipMeException e) { 409 //@formatter:off 410 warnings.add(new ParseWarning.Builder(context) 411 .message(22, e.getMessage()) 412 .build() 413 ); 414 //@formatter:on 415 416 return; 417 } catch (CannotParseException e) { 418 //@formatter:off 419 warnings.add(new ParseWarning.Builder(context) 420 .message(e) 421 .build() 422 ); 423 //@formatter:on 424 425 property = new RawProperty(className, element.outerHtml()); 426 vcard.addProperty(property); 427 return; 428 } catch (EmbeddedVCardException e) { 429 if (isChildOf(element, embeddedVCards)) { 430 //prevents multiple-nested embedded elements from overwriting each other 431 return; 432 } 433 434 property = e.getProperty(); 435 436 VCard embeddedVCard = parseEmbeddedVCard(element); 437 e.injectVCard(embeddedVCard); 438 vcard.addProperty(property); 439 return; 440 } 441 442 warnings.addAll(context.getWarnings()); 443 444 /* 445 * LABELs must be treated specially so they can be matched up with their 446 * ADRs. 447 */ 448 if (property instanceof Label) { 449 handleLabel((Label) property); 450 return; 451 } 452 453 /* 454 * Add all NICKNAMEs to the same type object. 455 */ 456 if (property instanceof Nickname) { 457 handleNickname((Nickname) property); 458 return; 459 } 460 461 /* 462 * Add all CATEGORIES to the same type object. 463 */ 464 if (property instanceof Categories) { 465 handleCategories((Categories) property); 466 return; 467 } 468 469 vcard.addProperty(property); 470 } 471 472 private void handleLabel(Label property) { 473 labels.add(property); 474 } 475 476 private void handleNickname(Nickname property) { 477 if (nickname == null) { 478 nickname = property; 479 vcard.addProperty(nickname); 480 } else { 481 nickname.getValues().addAll(property.getValues()); 482 } 483 } 484 485 private void handleCategories(Categories property) { 486 if (categories == null) { 487 categories = property; 488 vcard.addProperty(categories); 489 } else { 490 categories.getValues().addAll(property.getValues()); 491 } 492 } 493 494 public void close() { 495 //empty 496 } 497}