001 package ezvcard.io.html; 002 003 import java.io.File; 004 import java.io.IOException; 005 import java.io.InputStream; 006 import java.io.Reader; 007 import java.net.MalformedURLException; 008 import java.net.URL; 009 import java.util.ArrayList; 010 import java.util.Iterator; 011 import java.util.List; 012 import java.util.Set; 013 014 import org.jsoup.Jsoup; 015 import org.jsoup.nodes.Document; 016 import org.jsoup.nodes.Element; 017 import org.jsoup.select.Elements; 018 019 import ezvcard.Messages; 020 import ezvcard.VCard; 021 import ezvcard.VCardVersion; 022 import ezvcard.io.CannotParseException; 023 import ezvcard.io.EmbeddedVCardException; 024 import ezvcard.io.SkipMeException; 025 import ezvcard.io.scribe.RawPropertyScribe; 026 import ezvcard.io.scribe.ScribeIndex; 027 import ezvcard.io.scribe.VCardPropertyScribe; 028 import ezvcard.io.scribe.VCardPropertyScribe.Result; 029 import ezvcard.property.Address; 030 import ezvcard.property.Categories; 031 import ezvcard.property.Email; 032 import ezvcard.property.Impp; 033 import ezvcard.property.Label; 034 import ezvcard.property.Nickname; 035 import ezvcard.property.RawProperty; 036 import ezvcard.property.Source; 037 import ezvcard.property.Telephone; 038 import ezvcard.property.Url; 039 import ezvcard.property.VCardProperty; 040 import ezvcard.util.HtmlUtils; 041 042 /* 043 Copyright (c) 2013, Michael Angstadt 044 All rights reserved. 045 046 Redistribution and use in source and binary forms, with or without 047 modification, are permitted provided that the following conditions are met: 048 049 1. Redistributions of source code must retain the above copyright notice, this 050 list of conditions and the following disclaimer. 051 2. Redistributions in binary form must reproduce the above copyright notice, 052 this list of conditions and the following disclaimer in the documentation 053 and/or other materials provided with the distribution. 054 055 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 056 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 057 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 058 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 059 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 060 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 061 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 062 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 063 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 064 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 065 066 The views and conclusions contained in the software and documentation are those 067 of the authors and should not be interpreted as representing official policies, 068 either expressed or implied, of the FreeBSD Project. 069 */ 070 071 /** 072 * <p> 073 * Parses {@link VCard} objects from an HTML page (hCard format). 074 * </p> 075 * <p> 076 * <b>Example:</b> 077 * 078 * <pre class="brush:java"> 079 * URL url = new URL("http://example.com"); 080 * HCardReader hcardReader = new HCardReader(url); 081 * VCard vcard; 082 * while ((vcard = hcardReader.readNext()) != null){ 083 * ... 084 * } 085 * </pre> 086 * 087 * </p> 088 * @author Michael Angstadt 089 * @see <a 090 * href="http://microformats.org/wiki/hcard">http://microformats.org/wiki/hcard</a> 091 */ 092 public class HCardReader { 093 private ScribeIndex index = new ScribeIndex(); 094 private String pageUrl; 095 private final List<String> warnings = new ArrayList<String>(); 096 private Elements vcardElements; 097 private Iterator<Element> it; 098 private final List<Label> labels = new ArrayList<Label>(); 099 private VCard curVCard; 100 private Elements embeddedVCards = new Elements(); 101 private Nickname nickname; 102 private Categories categories; 103 104 private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase(); 105 private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase(); 106 private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase(); 107 private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase(); 108 109 /** 110 * Creates a reader that parses hCards from a URL. 111 * @param url the URL of the webpage 112 * @throws IOException if there's a problem loading the webpage 113 */ 114 public HCardReader(URL url) throws IOException { 115 pageUrl = url.toString(); 116 Document document = Jsoup.parse(url, 30000); 117 init(document, url.getRef()); 118 } 119 120 /** 121 * Creates a reader that parses hCards from an input stream. 122 * @param in the input stream to the HTML page 123 * @throws IOException if there's a problem reading the HTML page 124 */ 125 public HCardReader(InputStream in) throws IOException { 126 this(in, null); 127 } 128 129 /** 130 * Creates a reader that parses hCards from an input stream. 131 * @param in the input stream to the HTML page 132 * @param pageUrl the original URL of the HTML page 133 * @throws IOException if there's a problem reading the HTML page 134 */ 135 public HCardReader(InputStream in, String pageUrl) throws IOException { 136 this.pageUrl = pageUrl; 137 Document document = (pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl); 138 String anchor = getAnchor(pageUrl); 139 init(document, anchor); 140 } 141 142 /** 143 * Creates a reader that parses hCards from a file. 144 * @param file the HTML file 145 * @throws IOException if there's a problem reading the HTML file 146 */ 147 public HCardReader(File file) throws IOException { 148 this(file, null); 149 } 150 151 /** 152 * Creates a reader that parses hCards from a file. 153 * @param file the HTML file 154 * @param pageUrl the original URL of the HTML page 155 * @throws IOException if there's a problem reading the HTML file 156 */ 157 public HCardReader(File file, String pageUrl) throws IOException { 158 this.pageUrl = pageUrl; 159 Document document = (pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl); 160 String anchor = getAnchor(pageUrl); 161 init(document, anchor); 162 } 163 164 /** 165 * Creates a reader that parses hCards from a reader. 166 * @param reader the input stream to the HTML page 167 * @throws IOException if there's a problem reading the HTML page 168 */ 169 public HCardReader(Reader reader) throws IOException { 170 this(reader, null); 171 } 172 173 /** 174 * Creates a reader that parses hCards from a reader. 175 * @param reader the input stream to the HTML page 176 * @param pageUrl the original URL of the HTML page 177 * @throws IOException if there's a problem reading the HTML page 178 */ 179 public HCardReader(Reader reader, String pageUrl) throws IOException { 180 this.pageUrl = pageUrl; 181 182 StringBuilder sb = new StringBuilder(); 183 char buffer[] = new char[4096]; 184 int read; 185 while ((read = reader.read(buffer)) != -1) { 186 sb.append(buffer, 0, read); 187 } 188 String html = sb.toString(); 189 190 Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl); 191 String anchor = getAnchor(pageUrl); 192 init(document, anchor); 193 } 194 195 /** 196 * Creates a reader that parses hCards from a string. 197 * @param html the HTML page 198 */ 199 public HCardReader(String html) { 200 this(html, null); 201 } 202 203 /** 204 * Creates a reader that parses hCards from a string. 205 * @param html the HTML page 206 * @param pageUrl the original URL of the HTML page 207 */ 208 public HCardReader(String html, String pageUrl) { 209 this.pageUrl = pageUrl; 210 211 Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl); 212 String anchor = getAnchor(pageUrl); 213 init(document, anchor); 214 } 215 216 /** 217 * Constructor for reading embedded vCards. 218 * @param embeddedVCard the HTML element of the embedded vCard 219 * @param pageUrl the original URL of the HTML page 220 */ 221 private HCardReader(Element embeddedVCard, String pageUrl) { 222 this.pageUrl = pageUrl; 223 vcardElements = new Elements(embeddedVCard); 224 it = vcardElements.iterator(); 225 } 226 227 private void init(Document document, String anchor) { 228 Element searchIn = null; 229 if (anchor != null) { 230 searchIn = document.getElementById(anchor); 231 } 232 if (searchIn == null) { 233 searchIn = document; 234 } 235 236 vcardElements = searchIn.getElementsByClass("vcard"); 237 it = vcardElements.iterator(); 238 } 239 240 /** 241 * Gets the anchor part of a URL. 242 * @param urlStr the URL 243 * @return the anchor (e.g. "foo" from the URL 244 * "http://example.com/index.php#foo") 245 */ 246 private String getAnchor(String urlStr) { 247 if (urlStr == null) { 248 return null; 249 } 250 251 try { 252 URL url = new URL(urlStr); 253 return url.getRef(); 254 } catch (MalformedURLException e) { 255 return null; 256 } 257 } 258 259 /** 260 * <p> 261 * Registers a property scribe. This is the same as calling: 262 * </p> 263 * <p> 264 * {@code getScribeIndex().register(scribe)} 265 * </p> 266 * @param scribe the scribe to register 267 */ 268 public void registerScribe(VCardPropertyScribe<? extends VCardProperty> scribe) { 269 index.register(scribe); 270 } 271 272 /** 273 * Gets the scribe index. 274 * @return the scribe index 275 */ 276 public ScribeIndex getScribeIndex() { 277 return index; 278 } 279 280 /** 281 * Sets the scribe index. 282 * @param index the scribe index 283 */ 284 public void setScribeIndex(ScribeIndex index) { 285 this.index = index; 286 } 287 288 /** 289 * Gets the warnings from the last vCard that was unmarshalled. This list is 290 * reset every time a new vCard is read. 291 * @return the warnings or empty list if there were no warnings 292 */ 293 public List<String> getWarnings() { 294 return new ArrayList<String>(warnings); 295 } 296 297 /** 298 * Reads the next vCard from the data stream. 299 * @return the next vCard or null if there are no more 300 */ 301 public VCard readNext() { 302 Element vcardElement = null; 303 while (it.hasNext() && vcardElement == null) { 304 vcardElement = it.next(); 305 306 //if this element is a child of another "vcard" element, then ignore it because it's an embedded vcard 307 if (HtmlUtils.isChildOf(vcardElement, vcardElements)) { 308 vcardElement = null; 309 } 310 } 311 if (vcardElement == null) { 312 return null; 313 } 314 315 warnings.clear(); 316 labels.clear(); 317 nickname = null; 318 categories = null; 319 320 curVCard = new VCard(); 321 curVCard.setVersion(VCardVersion.V3_0); 322 if (pageUrl != null) { 323 curVCard.addSource(new Source(pageUrl)); 324 } 325 326 //visit all descendant nodes, depth-first 327 for (Element child : vcardElement.children()) { 328 visit(child); 329 } 330 331 //assign labels to their addresses 332 for (Label label : labels) { 333 boolean orphaned = true; 334 for (Address adr : curVCard.getAddresses()) { 335 if (adr.getLabel() == null && adr.getTypes().equals(label.getTypes())) { 336 adr.setLabel(label.getValue()); 337 orphaned = false; 338 break; 339 } 340 } 341 if (orphaned) { 342 curVCard.addOrphanedLabel(label); 343 } 344 } 345 346 return curVCard; 347 } 348 349 private void visit(Element element) { 350 Set<String> classNames = element.classNames(); 351 for (String className : classNames) { 352 className = className.toLowerCase(); 353 354 if (urlPropertyName.equalsIgnoreCase(className)) { 355 String href = element.attr("href"); 356 if (href.length() > 0) { 357 if (!classNames.contains(emailName) && href.matches("(?i)mailto:.*")) { 358 className = emailName; 359 } else if (!classNames.contains(telName) && href.matches("(?i)tel:.*")) { 360 className = telName; 361 } else { 362 //try parsing as IMPP 363 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class); 364 try { 365 Result<? extends VCardProperty> result = scribe.parseHtml(element); 366 curVCard.addProperty(result.getProperty()); 367 for (String warning : result.getWarnings()) { 368 addWarning(scribe.getPropertyName(), warning); 369 } 370 continue; 371 } catch (SkipMeException e) { 372 //URL is not an instant messenger URL 373 } catch (CannotParseException e) { 374 //URL is not an instant messenger URL 375 } 376 } 377 } 378 } 379 380 //hCard uses a different name for the CATEGORIES property 381 if ("category".equalsIgnoreCase(className)) { 382 className = categoriesName; 383 } 384 385 VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className); 386 if (scribe == null) { 387 //if no scribe is found, and the class name doesn't start with "x-", then it must be an arbitrary CSS class that has nothing to do with vCard 388 if (!className.startsWith("x-")) { 389 continue; 390 } 391 scribe = new RawPropertyScribe(className); 392 } 393 394 VCardProperty property; 395 try { 396 Result<? extends VCardProperty> result = scribe.parseHtml(element); 397 398 for (String warning : result.getWarnings()) { 399 addWarning(className, warning); 400 } 401 402 property = result.getProperty(); 403 404 //LABELs must be treated specially so they can be matched up with their ADRs 405 if (property instanceof Label) { 406 labels.add((Label) property); 407 continue; 408 } 409 410 //add all NICKNAMEs to the same type object 411 if (property instanceof Nickname) { 412 Nickname nn = (Nickname) property; 413 if (nickname == null) { 414 nickname = nn; 415 curVCard.addProperty(nickname); 416 } else { 417 nickname.getValues().addAll(nn.getValues()); 418 } 419 continue; 420 } 421 422 //add all CATEGORIES to the same type object 423 if (property instanceof Categories) { 424 Categories c = (Categories) property; 425 if (categories == null) { 426 categories = c; 427 curVCard.addProperty(categories); 428 } else { 429 categories.getValues().addAll(c.getValues()); 430 } 431 continue; 432 } 433 } catch (SkipMeException e) { 434 addWarning(className, 22, e.getMessage()); 435 continue; 436 } catch (CannotParseException e) { 437 String html = element.outerHtml(); 438 addWarning(className, 32, html, e.getMessage()); 439 property = new RawProperty(className, html); 440 } catch (EmbeddedVCardException e) { 441 if (HtmlUtils.isChildOf(element, embeddedVCards)) { 442 //prevents multiple-nested embedded elements from overwriting each other 443 continue; 444 } 445 446 property = e.getProperty(); 447 448 embeddedVCards.add(element); 449 HCardReader embeddedReader = new HCardReader(element, pageUrl); 450 try { 451 VCard embeddedVCard = embeddedReader.readNext(); 452 e.injectVCard(embeddedVCard); 453 } finally { 454 for (String w : embeddedReader.getWarnings()) { 455 addWarning(className, 26, w); 456 } 457 } 458 } 459 460 curVCard.addProperty(property); 461 } 462 463 for (Element child : element.children()) { 464 visit(child); 465 } 466 } 467 468 private void addWarning(String propertyName, int code, Object... args) { 469 String message = Messages.INSTANCE.getParseMessage(code, args); 470 addWarning(propertyName, message); 471 } 472 473 private void addWarning(String propertyName, String message) { 474 String warning = Messages.INSTANCE.getParseMessage(35, propertyName, message); 475 warnings.add(warning); 476 } 477 }