001package ezvcard.io.html; 002 003import static ezvcard.util.HtmlUtils.isChildOf; 004 005import java.io.IOException; 006import java.io.InputStream; 007import java.io.Reader; 008import java.io.UncheckedIOException; 009import java.net.MalformedURLException; 010import java.net.URL; 011import java.nio.file.Path; 012import java.util.ArrayList; 013import java.util.Iterator; 014import java.util.List; 015import java.util.Set; 016 017import org.jsoup.Jsoup; 018import org.jsoup.nodes.Document; 019import org.jsoup.nodes.Element; 020import org.jsoup.select.Elements; 021 022import ezvcard.VCard; 023import ezvcard.VCardVersion; 024import ezvcard.io.CannotParseException; 025import ezvcard.io.EmbeddedVCardException; 026import ezvcard.io.ParseWarning; 027import ezvcard.io.SkipMeException; 028import ezvcard.io.StreamReader; 029import ezvcard.io.scribe.RawPropertyScribe; 030import ezvcard.io.scribe.VCardPropertyScribe; 031import ezvcard.property.Categories; 032import ezvcard.property.Email; 033import ezvcard.property.Impp; 034import ezvcard.property.Label; 035import ezvcard.property.Nickname; 036import ezvcard.property.RawProperty; 037import ezvcard.property.Telephone; 038import ezvcard.property.Url; 039import ezvcard.property.VCardProperty; 040import ezvcard.util.Gobble; 041import ezvcard.util.IOUtils; 042 043/* 044 Copyright (c) 2012-2023, Michael Angstadt 045 All rights reserved. 046 047 Redistribution and use in source and binary forms, with or without 048 modification, are permitted provided that the following conditions are met: 049 050 1. Redistributions of source code must retain the above copyright notice, this 051 list of conditions and the following disclaimer. 052 2. Redistributions in binary form must reproduce the above copyright notice, 053 this list of conditions and the following disclaimer in the documentation 054 and/or other materials provided with the distribution. 055 056 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 057 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 058 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 059 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 060 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 061 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 062 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 063 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 064 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 065 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 066 067 The views and conclusions contained in the software and documentation are those 068 of the authors and should not be interpreted as representing official policies, 069 either expressed or implied, of the FreeBSD Project. 070 */ 071 072/** 073 * <p> 074 * Parses {@link VCard} objects from an HTML page (hCard format). 075 * </p> 076 * <p> 077 * <b>Example:</b> 078 * </p> 079 * 080 * <pre class="brush:java"> 081 * URL url = new URL("http://example.com"); 082 * HCardParser parser = new HCardParser(url); 083 * List<VCard> vcards = parser.parseAll(); 084 * </pre> 085 * @author Michael Angstadt 086 * @see <a href="http://microformats.org/wiki/hcard">http://microformats.org/ 087 * wiki/hcard</a> 088 */ 089public class HCardParser extends StreamReader { 090 private final String pageUrl; 091 private final Elements vcardElements; 092 private final Iterator<Element> vcardElementsIt; 093 private final List<Label> labels = new ArrayList<>(); 094 095 private VCard vcard; 096 private Elements embeddedVCards = new Elements(); 097 private Nickname nickname; 098 private Categories categories; 099 100 private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase(); 101 private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase(); 102 private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase(); 103 private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase(); 104 105 /** 106 * Creates an hCard document. 107 * @param url the URL of the webpage 108 * @throws IOException if there's a problem loading the webpage 109 */ 110 public HCardParser(URL url) throws IOException { 111 this(Jsoup.parse(url, 30000), url.toString()); 112 } 113 114 /** 115 * Creates an hCard document. 116 * @param in the input stream to the HTML page 117 * @throws IOException if there's a problem reading the HTML page 118 */ 119 public HCardParser(InputStream in) throws IOException { 120 this(in, null); 121 } 122 123 /** 124 * Creates an hCard document. 125 * @param in the input stream to the HTML page 126 * @param pageUrl the original URL of the HTML page (used to resolve 127 * relative links) 128 * @throws IOException if there's a problem reading the HTML page 129 */ 130 public HCardParser(InputStream in, String pageUrl) throws IOException { 131 this((pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl), pageUrl); 132 } 133 134 /** 135 * Creates an hCard document. 136 * @param file the HTML file 137 * @throws IOException if there's a problem reading the HTML file 138 */ 139 public HCardParser(Path file) throws IOException { 140 this(file, null); 141 } 142 143 /** 144 * Creates an hCard document. 145 * @param file the HTML file 146 * @param pageUrl the original URL of the HTML page (used to resolve 147 * relative links) 148 * @throws IOException if there's a problem reading the HTML file 149 */ 150 public HCardParser(Path file, String pageUrl) throws IOException { 151 this((pageUrl == null) ? Jsoup.parse(file.toFile(), null, "") : Jsoup.parse(file.toFile(), null, pageUrl), pageUrl); 152 } 153 154 /** 155 * Creates an hCard document. 156 * @param reader the input stream to the HTML page 157 * @throws IOException if there's a problem reading the HTML page 158 */ 159 public HCardParser(Reader reader) throws IOException { 160 this(reader, null); 161 } 162 163 /** 164 * Creates an hCard document. 165 * @param reader the input stream to the HTML page 166 * @param pageUrl the original URL of the HTML page (used to resolve 167 * relative links) 168 * @throws IOException if there's a problem reading the HTML page 169 */ 170 public HCardParser(Reader reader, String pageUrl) throws IOException { 171 this(new Gobble(reader).asString(), pageUrl); 172 } 173 174 /** 175 * Creates an hCard document. 176 * @param html the HTML page 177 */ 178 public HCardParser(String html) { 179 this(html, null); 180 } 181 182 /** 183 * Creates an hCard document. 184 * @param html the HTML page 185 * @param pageUrl the original URL of the HTML page (used to resolve 186 * relative links) 187 */ 188 public HCardParser(String html, String pageUrl) { 189 this((pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl), pageUrl); 190 } 191 192 /** 193 * Creates an hCard document. 194 * @param document the HTML page 195 */ 196 public HCardParser(Document document) { 197 this(document, null); 198 } 199 200 /** 201 * Creates an hCard document. 202 * @param document the HTML page 203 * @param pageUrl the original URL of the HTML page (used to resolve 204 * relative links) 205 */ 206 public HCardParser(Document document, String pageUrl) { 207 this.pageUrl = pageUrl; 208 209 String anchor = null; 210 if (pageUrl != null) { 211 try { 212 URL url = new URL(pageUrl); 213 anchor = url.getRef(); 214 } catch (MalformedURLException e) { 215 anchor = null; 216 } 217 } 218 219 Element searchUnder = null; 220 if (anchor != null) { 221 searchUnder = document.getElementById(anchor); 222 } 223 if (searchUnder == null) { 224 searchUnder = document; 225 } 226 227 vcardElements = searchUnder.getElementsByClass("vcard"); 228 229 //remove nested vcard elements 230 Iterator<Element> it = vcardElements.iterator(); 231 while (it.hasNext()) { 232 Element element = it.next(); 233 if (isChildOf(element, vcardElements)) { 234 it.remove(); 235 } 236 } 237 238 vcardElementsIt = vcardElements.iterator(); 239 } 240 241 /** 242 * Constructor for reading embedded vCards. 243 * @param embeddedVCard the HTML element of the embedded vCard 244 * @param pageUrl the original URL of the HTML page 245 */ 246 private HCardParser(Element embeddedVCard, String pageUrl) { 247 this.pageUrl = pageUrl; 248 vcardElements = new Elements(embeddedVCard); 249 vcardElementsIt = vcardElements.iterator(); 250 } 251 252 @Override 253 public VCard readNext() { 254 try { 255 return super.readNext(); 256 } catch (IOException e) { 257 //will not be thrown because reading from DOM 258 throw new UncheckedIOException(e); 259 } 260 } 261 262 @Override 263 protected VCard _readNext() { 264 if (!vcardElementsIt.hasNext()) { 265 return null; 266 } 267 268 context.setVersion(VCardVersion.V3_0); 269 parseVCardElement(vcardElementsIt.next()); 270 return vcard; 271 } 272 273 private void parseVCardElement(Element vcardElement) { 274 labels.clear(); 275 nickname = null; 276 categories = null; 277 278 vcard = new VCard(); 279 vcard.setVersion(VCardVersion.V3_0); 280 if (pageUrl != null) { 281 vcard.addSource(pageUrl); 282 } 283 284 //visit all descendant nodes, depth-first 285 for (Element child : vcardElement.children()) { 286 visit(child); 287 } 288 289 //assign labels to their addresses 290 assignLabels(vcard, labels); 291 } 292 293 private void visit(Element element) { 294 boolean visitChildren = true; 295 Set<String> classNames = element.classNames(); 296 for (String className : classNames) { 297 className = className.toLowerCase(); 298 299 //give special treatment to certain URLs 300 if (urlPropertyName.equals(className)) { 301 String href = element.attr("href"); 302 if (href.length() > 0) { 303 if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) { 304 className = emailName; 305 } else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) { 306 className = telName; 307 } else { 308 //try parsing as IMPP 309 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class); 310 311 context.getWarnings().clear(); 312 context.setPropertyName(scribe.getPropertyName()); 313 try { 314 VCardProperty property = scribe.parseHtml(new HCardElement(element), context); 315 vcard.addProperty(property); 316 warnings.addAll(context.getWarnings()); 317 continue; 318 } catch (SkipMeException e) { 319 //URL is not an instant messenger URL 320 } catch (CannotParseException e) { 321 //URL is not an instant messenger URL 322 } 323 } 324 } 325 } 326 327 //hCard uses a different name for the CATEGORIES property 328 if ("category".equals(className)) { 329 className = categoriesName; 330 } 331 332 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className); 333 if (scribe == null) { 334 //if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard 335 if (!className.startsWith("x-")) { 336 continue; 337 } 338 scribe = new RawPropertyScribe(className); 339 } 340 341 context.getWarnings().clear(); 342 context.setPropertyName(scribe.getPropertyName()); 343 344 VCardProperty property; 345 try { 346 property = scribe.parseHtml(new HCardElement(element), context); 347 warnings.addAll(context.getWarnings()); 348 349 //LABELs must be treated specially so they can be matched up with their ADRs 350 if (property instanceof Label) { 351 labels.add((Label) property); 352 continue; 353 } 354 355 //add all NICKNAMEs to the same type object 356 if (property instanceof Nickname) { 357 Nickname nn = (Nickname) property; 358 if (nickname == null) { 359 nickname = nn; 360 vcard.addProperty(nickname); 361 } else { 362 nickname.getValues().addAll(nn.getValues()); 363 } 364 continue; 365 } 366 367 //add all CATEGORIES to the same type object 368 if (property instanceof Categories) { 369 Categories c = (Categories) property; 370 if (categories == null) { 371 categories = c; 372 vcard.addProperty(categories); 373 } else { 374 categories.getValues().addAll(c.getValues()); 375 } 376 continue; 377 } 378 } catch (SkipMeException e) { 379 //@formatter:off 380 warnings.add(new ParseWarning.Builder(context) 381 .message(22, e.getMessage()) 382 .build() 383 ); 384 //@formatter:on 385 continue; 386 } catch (CannotParseException e) { 387 //@formatter:off 388 warnings.add(new ParseWarning.Builder(context) 389 .message(e) 390 .build() 391 ); 392 //@formatter:on 393 394 property = new RawProperty(className, element.outerHtml()); 395 } catch (EmbeddedVCardException e) { 396 if (isChildOf(element, embeddedVCards)) { 397 //prevents multiple-nested embedded elements from overwriting each other 398 continue; 399 } 400 401 property = e.getProperty(); 402 403 embeddedVCards.add(element); 404 HCardParser embeddedReader = new HCardParser(element, pageUrl); 405 try { 406 VCard embeddedVCard = embeddedReader.readNext(); 407 e.injectVCard(embeddedVCard); 408 } finally { 409 warnings.addAll(embeddedReader.getWarnings()); 410 IOUtils.closeQuietly(embeddedReader); 411 } 412 visitChildren = false; 413 } 414 415 vcard.addProperty(property); 416 } 417 418 if (visitChildren) { 419 for (Element child : element.children()) { 420 visit(child); 421 } 422 } 423 } 424 425 public void close() { 426 //empty 427 } 428}