001package ezvcard.io.html;
002
003import static ezvcard.util.HtmlUtils.isChildOf;
004
005import java.io.IOException;
006import java.io.InputStream;
007import java.io.Reader;
008import java.io.UncheckedIOException;
009import java.net.URL;
010import java.nio.file.Path;
011import java.time.Duration;
012import java.util.ArrayList;
013import java.util.Iterator;
014import java.util.List;
015import java.util.Set;
016import java.util.stream.Collectors;
017
018import org.jsoup.Jsoup;
019import org.jsoup.nodes.Document;
020import org.jsoup.nodes.Element;
021import org.jsoup.select.Elements;
022
023import ezvcard.VCard;
024import ezvcard.VCardVersion;
025import ezvcard.io.CannotParseException;
026import ezvcard.io.EmbeddedVCardException;
027import ezvcard.io.ParseWarning;
028import ezvcard.io.SkipMeException;
029import ezvcard.io.StreamReader;
030import ezvcard.io.scribe.RawPropertyScribe;
031import ezvcard.io.scribe.VCardPropertyScribe;
032import ezvcard.property.Categories;
033import ezvcard.property.Email;
034import ezvcard.property.Impp;
035import ezvcard.property.Label;
036import ezvcard.property.Nickname;
037import ezvcard.property.RawProperty;
038import ezvcard.property.Telephone;
039import ezvcard.property.Url;
040import ezvcard.property.VCardProperty;
041import ezvcard.util.Gobble;
042import ezvcard.util.HtmlUtils;
043import ezvcard.util.IOUtils;
044
045/*
046 Copyright (c) 2012-2026, Michael Angstadt
047 All rights reserved.
048
049 Redistribution and use in source and binary forms, with or without
050 modification, are permitted provided that the following conditions are met: 
051
052 1. Redistributions of source code must retain the above copyright notice, this
053 list of conditions and the following disclaimer. 
054 2. Redistributions in binary form must reproduce the above copyright notice,
055 this list of conditions and the following disclaimer in the documentation
056 and/or other materials provided with the distribution. 
057
058 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
059 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
060 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
061 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
062 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
063 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
064 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
065 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
066 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
067 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
068
069 The views and conclusions contained in the software and documentation are those
070 of the authors and should not be interpreted as representing official policies, 
071 either expressed or implied, of the FreeBSD Project.
072 */
073
074/**
075 * <p>
076 * Parses {@link VCard} objects from an HTML page (hCard format).
077 * </p>
078 * <p>
079 * <b>Example:</b>
080 * </p>
081 * 
082 * <pre class="brush:java">
083 * URL url = new URL("http://example.com");
084 * HCardParser parser = new HCardParser(url);
085 * List&lt;VCard&gt; vcards = parser.parseAll();
086 * </pre>
087 * @author Michael Angstadt
088 * @see <a href="http://microformats.org/wiki/hcard">http://microformats.org/
089 * wiki/hcard</a>
090 */
091public class HCardParser extends StreamReader {
092        private static final Duration urlTimeout = Duration.ofSeconds(30);
093
094        private final String pageUrl;
095        private final Iterator<Element> vcardElementsIt;
096        private final List<Label> labels = new ArrayList<>();
097
098        private VCard vcard;
099        private Elements embeddedVCards = new Elements();
100        private Nickname nickname;
101        private Categories categories;
102
103        private final String urlPropertyName = index.getPropertyScribe(Url.class).getPropertyName().toLowerCase();
104        private final String categoriesName = index.getPropertyScribe(Categories.class).getPropertyName().toLowerCase();
105        private final String emailName = index.getPropertyScribe(Email.class).getPropertyName().toLowerCase();
106        private final String telName = index.getPropertyScribe(Telephone.class).getPropertyName().toLowerCase();
107
        /**
         * Creates an hCard parser that reads the webpage at the given URL. The
         * page is loaded through Jsoup using the timeout held in
         * {@code urlTimeout}, and the string form of the URL is used as the page
         * URL.
         * @param url the URL of the webpage
         * @throws IOException if there's a problem loading the webpage
         */
        public HCardParser(URL url) throws IOException {
                //Jsoup expects the timeout as an int number of milliseconds
                this(Jsoup.parse(url, (int) urlTimeout.toMillis()), url.toString());
        }
116
        /**
         * Creates an hCard parser that reads an HTML page from an input stream,
         * without an associated page URL.
         * @param in the input stream to the HTML page
         * @throws IOException if there's a problem reading the HTML page
         */
        public HCardParser(InputStream in) throws IOException {
                //null = no page URL
                this(in, null);
        }
125
126        /**
127         * Creates an hCard document.
128         * @param in the input stream to the HTML page
129         * @param pageUrl the original URL of the HTML page (used to resolve
130         * relative links)
131         * @throws IOException if there's a problem reading the HTML page
132         */
133        public HCardParser(InputStream in, String pageUrl) throws IOException {
134                this((pageUrl == null) ? Jsoup.parse(in, null, "") : Jsoup.parse(in, null, pageUrl), pageUrl);
135        }
136
        /**
         * Creates an hCard parser that reads an HTML file, without an associated
         * page URL.
         * @param file the HTML file
         * @throws IOException if there's a problem reading the HTML file
         */
        public HCardParser(Path file) throws IOException {
                //null = no page URL
                this(file, null);
        }
145
146        /**
147         * Creates an hCard document.
148         * @param file the HTML file
149         * @param pageUrl the original URL of the HTML page (used to resolve
150         * relative links)
151         * @throws IOException if there's a problem reading the HTML file
152         */
153        public HCardParser(Path file, String pageUrl) throws IOException {
154                this((pageUrl == null) ? Jsoup.parse(file.toFile(), null, "") : Jsoup.parse(file.toFile(), null, pageUrl), pageUrl);
155        }
156
        /**
         * Creates an hCard parser that reads an HTML page from a reader, without
         * an associated page URL.
         * @param reader the reader that supplies the HTML page
         * @throws IOException if there's a problem reading the HTML page
         */
        public HCardParser(Reader reader) throws IOException {
                //null = no page URL
                this(reader, null);
        }
165
        /**
         * Creates an hCard parser that reads an HTML page from a reader. The
         * reader's content is turned into a string (via {@link Gobble#asString})
         * before being parsed.
         * @param reader the reader that supplies the HTML page
         * @param pageUrl the original URL of the HTML page (used to resolve
         * relative links), or null if there is none
         * @throws IOException if there's a problem reading the HTML page
         */
        public HCardParser(Reader reader, String pageUrl) throws IOException {
                this(new Gobble(reader).asString(), pageUrl);
        }
176
        /**
         * Creates an hCard parser that reads an HTML page held in a string,
         * without an associated page URL.
         * @param html the HTML page
         */
        public HCardParser(String html) {
                //null = no page URL
                this(html, null);
        }
184
        /**
         * Creates an hCard parser that reads an HTML page held in a string. When
         * a page URL is given, it is passed to Jsoup along with the HTML;
         * otherwise the HTML is parsed on its own.
         * @param html the HTML page
         * @param pageUrl the original URL of the HTML page (used to resolve
         * relative links), or null if there is none
         */
        public HCardParser(String html, String pageUrl) {
                this((pageUrl == null) ? Jsoup.parse(html) : Jsoup.parse(html, pageUrl), pageUrl);
        }
194
        /**
         * Creates an hCard parser that reads an already-parsed HTML document,
         * without an associated page URL.
         * @param document the HTML page
         */
        public HCardParser(Document document) {
                //null = no page URL
                this(document, null);
        }
202
203        /**
204         * Creates an hCard document.
205         * @param document the HTML page
206         * @param pageUrl the original URL of the HTML page (used to resolve
207         * relative links)
208         */
209        public HCardParser(Document document, String pageUrl) {
210                this.pageUrl = pageUrl;
211
212                String anchor = (pageUrl == null) ? null : HtmlUtils.getAnchorFromUrl(pageUrl);
213
214                Element searchUnder = (anchor == null) ? null : document.getElementById(anchor);
215                if (searchUnder == null) {
216                        searchUnder = document;
217                }
218
219                /*
220                 * Nested vCards also show up in this list as separate list items. For
221                 * example, if the HTML document has one vCard and that vCard has one
222                 * nested vCard (i.e. AGENT property), this list will have two elements.
223                 * 
224                 * Exclude the nested vCards from being processed as their own,
225                 * independent vCards.
226                 */
227                Elements vcardElementsIncludingNested = searchUnder.getElementsByClass("vcard");
228
229                //@formatter:off
230                List<Element> vcardElementsWithoutNested = vcardElementsIncludingNested.stream()
231                        .filter(element -> !isChildOf(element, vcardElementsIncludingNested))
232                .collect(Collectors.toList());
233                //@formatter:on
234
235                vcardElementsIt = vcardElementsWithoutNested.iterator();
236        }
237
        /**
         * Constructor for reading embedded vCards. The resulting parser iterates
         * over exactly one element: the embedded vCard that was passed in.
         * @param embeddedVCard the HTML element of the embedded vCard
         * @param pageUrl the original URL of the HTML page
         */
        private HCardParser(Element embeddedVCard, String pageUrl) {
                this.pageUrl = pageUrl;
                //wrap the single element so it can be consumed through the same iterator field
                vcardElementsIt = new Elements(embeddedVCard).iterator();
        }
247
248        @Override
249        public VCard readNext() {
250                try {
251                        return super.readNext();
252                } catch (IOException e) {
253                        //will not be thrown because reading from DOM
254                        throw new UncheckedIOException(e);
255                }
256        }
257
258        @Override
259        protected VCard _readNext() {
260                if (!vcardElementsIt.hasNext()) {
261                        return null;
262                }
263
264                context.setVersion(VCardVersion.V3_0);
265                parseVCardElement(vcardElementsIt.next());
266                return vcard;
267        }
268
269        private void parseVCardElement(Element vcardElement) {
270                labels.clear();
271                nickname = null;
272                categories = null;
273
274                vcard = new VCard();
275                vcard.setVersion(VCardVersion.V3_0);
276                if (pageUrl != null) {
277                        vcard.addSource(pageUrl);
278                }
279
280                //visit all descendant nodes, depth-first
281                vcardElement.children().forEach(this::visit);
282
283                //assign labels to their addresses
284                assignLabels(vcard, labels);
285        }
286
287        private void visit(Element element) {
288                int embeddedVCardCount = embeddedVCards.size();
289
290                Set<String> classNames = adjustClassNames(element);
291
292                classNames.forEach(className -> parseProperty(element, className));
293
294                boolean noEmbeddedVCardsWereAdded = (embeddedVCardCount == embeddedVCards.size());
295                if (noEmbeddedVCardsWereAdded) {
296                        //do not visit children if there are any embedded vCards
297                        element.children().forEach(this::visit);
298                }
299        }
300
        /**
         * Computes the property names that an element's CSS classes stand for.
         * Every class name is first converted to lower-case, and is then passed
         * through {@link #adjustClassName}. The results are collected into a
         * set, so class names that end up with the same adjusted name are only
         * returned once.
         * @param element the element whose CSS class names are read
         * @return the adjusted, lower-cased class names
         */
        private Set<String> adjustClassNames(Element element) {
                //@formatter:off
                //first pass: the complete lower-cased set is needed by adjustClassName
                Set<String> classNamesToLower = element.classNames().stream()
                        .map(String::toLowerCase)
                .collect(Collectors.toSet());

                //second pass: apply the hCard-specific renames
                //NOTE(review): Collectors.toSet() does not promise an iteration order, so the order of the class attribute is presumably not preserved - confirm if callers care
                return classNamesToLower.stream()
                        .map(className -> adjustClassName(className, element, classNamesToLower))
                .collect(Collectors.toSet());
                //@formatter:on
        }
312
313        private String adjustClassName(String className, Element element, Set<String> origClassNames) {
314                /*
315                 * hCard uses a different name for the CATEGORIES property.
316                 */
317                if ("category".equals(className)) {
318                        return categoriesName;
319                }
320
321                /*
322                 * Give special treatment to certain URLs.
323                 */
324                if (urlPropertyName.equals(className)) {
325                        String href = element.attr("href");
326                        if (!origClassNames.contains(emailName) && href.matches("(?i)mailto:.*")) {
327                                return emailName;
328                        }
329                        if (!origClassNames.contains(telName) && href.matches("(?i)tel:.*")) {
330                                return telName;
331                        }
332                }
333
334                return className;
335        }
336
337        private VCardProperty tryToParseAsImpp(Element element) {
338                String href = element.attr("href");
339                if (href.isEmpty()) {
340                        return null;
341                }
342
343                VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(Impp.class);
344
345                context.getWarnings().clear();
346                context.setPropertyName(scribe.getPropertyName());
347                try {
348                        VCardProperty property = scribe.parseHtml(new HCardElement(element), context);
349                        warnings.addAll(context.getWarnings());
350                        return property;
351                } catch (SkipMeException | CannotParseException e) {
352                        //URL is not an instant messenger URL
353                        return null;
354                }
355        }
356
357        private VCardPropertyScribe<? extends VCardProperty> getPropertyScribe(String className) {
358                VCardPropertyScribe<? extends VCardProperty> scribe = index.getPropertyScribe(className);
359
360                if (scribe == null) {
361                        /*
362                         * If no scribe is found, and the class name doesn't start with
363                         * "x-", then it must be an arbitrary CSS class that has nothing to
364                         * do with vCard
365                         */
366                        if (!className.startsWith("x-")) {
367                                return null;
368                        }
369
370                        scribe = new RawPropertyScribe(className);
371                }
372
373                return scribe;
374        }
375
376        private VCard parseEmbeddedVCard(Element element) {
377                embeddedVCards.add(element);
378                HCardParser embeddedReader = new HCardParser(element, pageUrl);
379                try {
380                        return embeddedReader.readNext();
381                } finally {
382                        warnings.addAll(embeddedReader.getWarnings());
383                        IOUtils.closeQuietly(embeddedReader);
384                }
385        }
386
387        private void parseProperty(Element element, String className) {
388                if (urlPropertyName.equals(className)) {
389                        VCardProperty impp = tryToParseAsImpp(element);
390                        if (impp != null) {
391                                vcard.addProperty(impp);
392                                return;
393                        }
394                }
395
396                VCardPropertyScribe<? extends VCardProperty> scribe = getPropertyScribe(className);
397                if (scribe == null) {
398                        //it's a CSS class that's unrelated to hCard
399                        return;
400                }
401
402                context.getWarnings().clear();
403                context.setPropertyName(scribe.getPropertyName());
404
405                VCardProperty property;
406                try {
407                        property = scribe.parseHtml(new HCardElement(element), context);
408                } catch (SkipMeException e) {
409                        //@formatter:off
410                        warnings.add(new ParseWarning.Builder(context)
411                                .message(22, e.getMessage())
412                                .build()
413                        );
414                        //@formatter:on
415
416                        return;
417                } catch (CannotParseException e) {
418                        //@formatter:off
419                        warnings.add(new ParseWarning.Builder(context)
420                                .message(e)
421                                .build()
422                        );
423                        //@formatter:on
424
425                        property = new RawProperty(className, element.outerHtml());
426                        vcard.addProperty(property);
427                        return;
428                } catch (EmbeddedVCardException e) {
429                        if (isChildOf(element, embeddedVCards)) {
430                                //prevents multiple-nested embedded elements from overwriting each other
431                                return;
432                        }
433
434                        property = e.getProperty();
435
436                        VCard embeddedVCard = parseEmbeddedVCard(element);
437                        e.injectVCard(embeddedVCard);
438                        vcard.addProperty(property);
439                        return;
440                }
441
442                warnings.addAll(context.getWarnings());
443
444                /*
445                 * LABELs must be treated specially so they can be matched up with their
446                 * ADRs.
447                 */
448                if (property instanceof Label) {
449                        handleLabel((Label) property);
450                        return;
451                }
452
453                /*
454                 * Add all NICKNAMEs to the same type object.
455                 */
456                if (property instanceof Nickname) {
457                        handleNickname((Nickname) property);
458                        return;
459                }
460
461                /*
462                 * Add all CATEGORIES to the same type object.
463                 */
464                if (property instanceof Categories) {
465                        handleCategories((Categories) property);
466                        return;
467                }
468
469                vcard.addProperty(property);
470        }
471
        /**
         * Sets a LABEL property aside instead of adding it to the vCard. The
         * collected labels are handed to {@code assignLabels} once the whole
         * vCard element has been visited.
         * @param property the parsed label
         */
        private void handleLabel(Label property) {
                labels.add(property);
        }
475
476        private void handleNickname(Nickname property) {
477                if (nickname == null) {
478                        nickname = property;
479                        vcard.addProperty(nickname);
480                } else {
481                        nickname.getValues().addAll(property.getValues());
482                }
483        }
484
485        private void handleCategories(Categories property) {
486                if (categories == null) {
487                        categories = property;
488                        vcard.addProperty(categories);
489                } else {
490                        categories.getValues().addAll(property.getValues());
491                }
492        }
493
494        public void close() {
495                //empty
496        }
497}