001 package ezvcard.io;
002
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import ezvcard.VCard;
import ezvcard.VCardVersion;
import ezvcard.types.AddressType;
import ezvcard.types.CategoriesType;
import ezvcard.types.EmailType;
import ezvcard.types.ImppType;
import ezvcard.types.LabelType;
import ezvcard.types.NicknameType;
import ezvcard.types.RawType;
import ezvcard.types.SourceType;
import ezvcard.types.TelephoneType;
import ezvcard.types.TypeList;
import ezvcard.types.UrlType;
import ezvcard.types.VCardType;
import ezvcard.util.HtmlUtils;
037
038 /*
039 Copyright (c) 2012, Michael Angstadt
040 All rights reserved.
041
042 Redistribution and use in source and binary forms, with or without
043 modification, are permitted provided that the following conditions are met:
044
045 1. Redistributions of source code must retain the above copyright notice, this
046 list of conditions and the following disclaimer.
047 2. Redistributions in binary form must reproduce the above copyright notice,
048 this list of conditions and the following disclaimer in the documentation
049 and/or other materials provided with the distribution.
050
051 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
052 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
053 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
054 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
055 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
056 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
057 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
058 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
059 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
060 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
061
062 The views and conclusions contained in the software and documentation are those
063 of the authors and should not be interpreted as representing official policies,
064 either expressed or implied, of the FreeBSD Project.
065 */
066
067 /**
068 * Reads vCards encoded in HTML (hCard format).
069 * @author Michael Angstadt
070 * @see <a
071 * href="http://microformats.org/wiki/hcard">http://microformats.org/wiki/hcard</a>
072 */
073 public class HCardReader implements IParser {
074 protected String pageUrl;
075 protected List<String> warnings = new ArrayList<String>();
076 protected Map<String, Class<? extends VCardType>> extendedTypeClasses = new HashMap<String, Class<? extends VCardType>>();
077 protected Elements vcardElements;
078 protected Iterator<Element> it;
079 protected List<LabelType> labels = new ArrayList<LabelType>();
080 protected List<String> warningsBuffer = new ArrayList<String>();
081 protected VCard curVCard;
082 protected Elements embeddedVCards = new Elements();
083 protected NicknameType nickname;
084 protected CategoriesType categories;
085
086 /**
087 * @param url the URL of the webpage
088 * @throws IOException if there's a problem loading the webpage
089 */
090 public HCardReader(URL url) throws IOException {
091 pageUrl = url.toString();
092 Document document = Jsoup.parse(url, 30000);
093 init(document, url.getRef());
094 }
095
096 /**
097 * @param in the input stream to the HTML page
098 * @throws IOException if there's a problem reading the HTML page
099 */
100 public HCardReader(InputStream in) throws IOException {
101 this(in, null);
102 }
103
104 /**
105 * @param in the input stream to the HTML page
106 * @param pageUrl the original URL of the HTML page
107 * @throws IOException if there's a problem reading the HTML page
108 */
109 public HCardReader(InputStream in, String pageUrl) throws IOException {
110 this.pageUrl = pageUrl;
111 Document document = (pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl);
112 String anchor = getAnchor(pageUrl);
113 init(document, anchor);
114 }
115
116 /**
117 * @param file the HTML file
118 * @throws IOException if there's a problem reading the HTML file
119 */
120 public HCardReader(File file) throws IOException {
121 this(file, null);
122 }
123
124 /**
125 * @param file the HTML file
126 * @param pageUrl the original URL of the HTML page
127 * @throws IOException if there's a problem reading the HTML file
128 */
129 public HCardReader(File file, String pageUrl) throws IOException {
130 this.pageUrl = pageUrl;
131 Document document = (pageUrl == null) ? Jsoup.parse(file, null, "") : Jsoup.parse(file, null, pageUrl);
132 String anchor = getAnchor(pageUrl);
133 init(document, anchor);
134 }
135
136 /**
137 * @param reader the input stream to the HTML page
138 * @throws IOException if there's a problem reading the HTML page
139 */
140 public HCardReader(Reader reader) throws IOException {
141 this(reader, null);
142 }
143
144 /**
145 * @param reader the input stream to the HTML page
146 * @param pageUrl the original URL of the HTML page
147 * @throws IOException if there's a problem reading the HTML page
148 */
149 public HCardReader(Reader reader, String pageUrl) throws IOException {
150 this.pageUrl = pageUrl;
151
152 StringBuilder sb = new StringBuilder();
153 char buffer[] = new char[4096];
154 int read;
155 while ((read = reader.read(buffer)) != -1) {
156 sb.append(buffer, 0, read);
157 }
158 String html = sb.toString();
159
160 Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
161 String anchor = getAnchor(pageUrl);
162 init(document, anchor);
163 }
164
165 /**
166 * @param html the HTML page
167 */
168 public HCardReader(String html) {
169 this(html, null);
170 }
171
172 /**
173 * @param html the HTML page
174 * @param pageUrl the original URL of the HTML page
175 */
176 public HCardReader(String html, String pageUrl) {
177 this.pageUrl = pageUrl;
178
179 Document document = (pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl);
180 String anchor = getAnchor(pageUrl);
181 init(document, anchor);
182 }
183
184 /**
185 * Constructor for reading embedded vCards.
186 * @param embeddedVCard the HTML element of the embedded vCard
187 * @param pageUrl the original URL of the HTML page
188 */
189 private HCardReader(Element embeddedVCard, String pageUrl) {
190 this.pageUrl = pageUrl;
191 vcardElements = new Elements(embeddedVCard);
192 it = vcardElements.iterator();
193 }
194
195 private void init(Document document, String anchor) {
196 Element searchIn = null;
197 if (anchor != null) {
198 searchIn = document.getElementById(anchor);
199 }
200 if (searchIn == null) {
201 searchIn = document;
202 }
203
204 vcardElements = searchIn.getElementsByClass("vcard");
205 it = vcardElements.iterator();
206 }
207
208 /**
209 * Gets the anchor part of a URL.
210 * @param urlStr the URL
211 * @return the anchor (e.g. "foo" from the URL
212 * "http://example.com/index.php#foo")
213 */
214 private String getAnchor(String urlStr) {
215 if (urlStr == null) {
216 return null;
217 }
218
219 try {
220 URL url = new URL(urlStr);
221 return url.getRef();
222 } catch (MalformedURLException e) {
223 return null;
224 }
225 }
226
227 //@Override
228 public void registerExtendedType(Class<? extends VCardType> clazz) {
229 extendedTypeClasses.put(getTypeNameFromTypeClass(clazz), clazz);
230 }
231
232 //@Override
233 public void unregisterExtendedType(Class<? extends VCardType> clazz) {
234 extendedTypeClasses.remove(getTypeNameFromTypeClass(clazz));
235 }
236
237 //@Override
238 public List<String> getWarnings() {
239 return new ArrayList<String>(warnings);
240 }
241
242 //@Override
243 public VCard readNext() {
244 Element vcardElement = null;
245 while (it.hasNext() && vcardElement == null) {
246 vcardElement = it.next();
247
248 //if this element is a child of another "vcard" element, then ignore it because it's an embedded vcard
249 if (HtmlUtils.isChildOf(vcardElement, vcardElements)) {
250 vcardElement = null;
251 }
252 }
253 if (vcardElement == null) {
254 return null;
255 }
256
257 warnings.clear();
258 warningsBuffer.clear();
259 labels.clear();
260 nickname = null;
261 categories = null;
262
263 curVCard = new VCard();
264 curVCard.setVersion(VCardVersion.V3_0);
265 if (pageUrl != null) {
266 curVCard.addSource(new SourceType(pageUrl));
267 }
268
269 //visit all descendant nodes, depth-first
270 for (Element child : vcardElement.children()) {
271 visit(child);
272 }
273
274 //assign labels to their addresses
275 for (LabelType label : labels) {
276 boolean orphaned = true;
277 for (AddressType adr : curVCard.getAddresses()) {
278 if (adr.getLabel() == null && adr.getTypes().equals(label.getTypes())) {
279 adr.setLabel(label.getValue());
280 orphaned = false;
281 break;
282 }
283 }
284 if (orphaned) {
285 curVCard.addOrphanedLabel(label);
286 }
287 }
288
289 return curVCard;
290 }
291
292 private void visit(Element element) {
293 Set<String> classNames = element.classNames();
294 for (String className : classNames) {
295 if (UrlType.NAME.equalsIgnoreCase(className)) {
296 String href = element.attr("href");
297 if (href.length() > 0) {
298 if (!classNames.contains(EmailType.NAME.toLowerCase()) && href.matches("(?i)mailto:.*")) {
299 className = EmailType.NAME;
300 } else if (!classNames.contains(TelephoneType.NAME.toLowerCase()) && href.matches("(?i)tel:.*")) {
301 className = TelephoneType.NAME;
302 } else {
303 //try parsing as IMPP
304 warningsBuffer.clear();
305 ImppType impp = new ImppType();
306 try {
307 impp.unmarshalHtml(element, warningsBuffer);
308 addToVCard(impp, curVCard);
309 warnings.addAll(warningsBuffer);
310 continue;
311 } catch (SkipMeException e) {
312 //URL is not an instant messenger URL
313 }
314 }
315 }
316 }
317
318 VCardType type = createTypeObject(className);
319 if (type == null) {
320 //if no type class is found, then it must be an arbitrary CSS class that has nothing to do with vCard
321 continue;
322 }
323
324 warningsBuffer.clear();
325 try {
326 type.unmarshalHtml(element, warningsBuffer);
327
328 //add to vcard
329 if (type instanceof LabelType) {
330 //LABELs must be treated specially so they can be matched up with their ADRs
331 labels.add((LabelType) type);
332 } else if (type instanceof NicknameType) {
333 //add all NICKNAMEs to the same type object
334 NicknameType nn = (NicknameType) type;
335 if (nickname == null) {
336 nickname = nn;
337 addToVCard(nickname, curVCard);
338 } else {
339 nickname.getValues().addAll(nn.getValues());
340 }
341 } else if (type instanceof CategoriesType) {
342 //add all CATEGORIES to the same type object
343 CategoriesType c = (CategoriesType) type;
344 if (categories == null) {
345 categories = c;
346 addToVCard(categories, curVCard);
347 } else {
348 categories.getValues().addAll(c.getValues());
349 }
350 } else {
351 addToVCard(type, curVCard);
352 }
353 } catch (SkipMeException e) {
354 warningsBuffer.add(type.getTypeName() + " property will not be unmarshalled: " + e.getMessage());
355 } catch (EmbeddedVCardException e) {
356 if (HtmlUtils.isChildOf(element, embeddedVCards)) {
357 //prevents multiple-nested embedded elements from overwriting each other
358 continue;
359 }
360
361 embeddedVCards.add(element);
362 HCardReader embeddedReader = new HCardReader(element, pageUrl);
363 try {
364 VCard embeddedVCard = embeddedReader.readNext();
365 e.injectVCard(embeddedVCard);
366 } finally {
367 for (String w : embeddedReader.getWarnings()) {
368 warnings.add("Problem unmarshalling nested vCard value from " + type.getTypeName() + ": " + w);
369 }
370 }
371 addToVCard(type, curVCard);
372 } catch (UnsupportedOperationException e) {
373 //type class does not support hCard
374 warningsBuffer.add("Type class \"" + type.getClass().getName() + "\" does not support hCard unmarshalling.");
375 } finally {
376 warnings.addAll(warningsBuffer);
377 }
378 }
379
380 for (Element child : element.children()) {
381 visit(child);
382 }
383 }
384
385 /**
386 * Creates the appropriate {@link VCardType} instance, given the type name.
387 * This method does not unmarshal the type, it just creates the type object.
388 * @param typeName the type name (e.g. "fn")
389 * @return the type object or null if the type name was not recognized
390 */
391 private VCardType createTypeObject(String typeName) {
392 typeName = typeName.toLowerCase();
393 VCardType t = null;
394 Class<? extends VCardType> clazz = TypeList.getTypeClassByHCardTypeName(typeName);
395 if (clazz != null) {
396 try {
397 //create a new instance of the class
398 t = clazz.newInstance();
399 } catch (Exception e) {
400 //it is the responsibility of the EZ-vCard developer to ensure that this exception is never thrown
401 //all type classes defined in the EZ-vCard library MUST have public, no-arg constructors
402 throw new RuntimeException(e);
403 }
404 } else {
405 Class<? extends VCardType> extendedTypeClass = extendedTypeClasses.get(typeName);
406 if (extendedTypeClass != null) {
407 try {
408 t = extendedTypeClass.newInstance();
409 } catch (Exception e) {
410 //this should never happen because the type class is checked to see if it has a public, no-arg constructor in the "registerExtendedType" method
411 throw new RuntimeException("Extended type class \"" + extendedTypeClass.getName() + "\" must have a public, no-arg constructor.");
412 }
413 } else if (typeName.startsWith("x-")) {
414 t = new RawType(typeName); //use RawType instead of TextType because we don't want to unescape any characters that might be meaningful to this type
415 }
416 }
417 return t;
418 }
419
420 /**
421 * Adds a type object to the vCard.
422 * @param t the type object
423 * @param vcard the vCard
424 */
425 private void addToVCard(VCardType t, VCard vcard) {
426 Method method = TypeList.getAddMethod(t.getClass());
427 if (method != null) {
428 try {
429 method.invoke(vcard, t);
430 } catch (Exception e) {
431 //this should NEVER be thrown because the method MUST be public
432 throw new RuntimeException(e);
433 }
434 } else {
435 vcard.addExtendedType(t);
436 }
437 }
438
439 /**
440 * Gets the type name from a type class.
441 * @param clazz the type class
442 * @return the type name
443 */
444 private String getTypeNameFromTypeClass(Class<? extends VCardType> clazz) {
445 try {
446 VCardType t = clazz.newInstance();
447 return t.getTypeName().toLowerCase();
448 } catch (Exception e) {
449 //there is no public, no-arg constructor
450 throw new RuntimeException(e);
451 }
452 }
453 }