justwatch 웹사이트를 파싱하려고 합니다.
조회수 1063회
안녕하세요. 파이썬 selenium으로 웹페이지를 읽어서 변동 사항 알림을 받으려고 합니다.
해당 웹페이지는 justwatch.com이고,
실제 크롬에서 html 소스보기로 살펴보면
<body ng-class="{promotion: isPromotionActive && !bannerState.isHidden}">
<img src="https://apis.justwatch.com/checkip/warmup" alt="" style="display: block; width: 0; height: 0;">
<!-- loading spinner -->
<div class="jw__spinner" ng-hide="true">
<svg version="1.0" xmlns="http://www.w3.org/2000/svg"
width="80px" height="80px" viewBox="0 0 1024.000000 1024.000000"
preserveAspectRatio="xMidYMid meet"
class="shimmer"
style="position: relative; z-index: 2;">
<g transform="translate(0.000000,1024.000000) scale(0.100000,-0.100000)" fill="#0C151D" stroke="none">
<path d="M0 5120 l0 -5120 5120 0 5120 0 0 5120 0 5120 -5120 0 -5120 0 0
-5120z m2493 3285 c70 -18 69 -18 689 -333 318 -162 505 -263 518 -279 26 -34
25 -65 -2 -90 -38 -34 -1460 -765 -1499 -770 -28 -4 -41 -1 -58 16 -21 21 -21
24 -21 609 0 647 0 647 62 745 66 104 171 138 311 102z m1626 -804 c326 -165
1186 -613 1212 -632 33 -23 59 -67 59 -99 0 -58 -72 -99 -1066 -606 -305 -155
-347 -174 -391 -174 -58 0 -88 18 -102 63 -7 22 -9 258 -7 738 5 829 -2 776
110 778 41 1 69 -10 185 -68z m1587 -805 c22 -8 327 -163 679 -345 674 -348
715 -373 715 -436 0 -59 -52 -93 -540 -345 -895 -463 -905 -467 -971 -440 -60
25 -59 10 -59 786 0 617 2 710 16 739 25 54 84 69 160 41z m-3341 -73 c263
-131 1187 -611 1233 -640 66 -42 91 -87 72 -133 -7 -17 -27 -42 -44 -54 -33
-25 -832 -440 -1164 -605 -161 -80 -213 -101 -246 -101 -51 0 -81 20 -95 63
-7 22 -10 258 -7 745 4 788 1 752 69 770 47 14 82 5 182 -45z m5120 -874 c586
-299 987 -507 1021 -530 62 -42 132 -125 140 -165 16 -88 -17 -156 -110 -223
-26 -18 -297 -162 -604 -318 -611 -313 -629 -320 -671 -268 l-21 28 2 749 3
750 29 25 c39 33 61 28 211 -48z m-3440 21 c33 -15 330 -167 660 -338 639
-330 685 -358 685 -420 0 -72 -39 -94 -1076 -623 -302 -154 -336 -169 -383
-169 -43 0 -56 5 -75 25 -13 14 -26 38 -29 53 -3 15 -3 344 -1 731 5 776 3
752 67 769 43 12 80 5 152 -28z m-1645 -895 c271 -136 1193 -616 1222 -636 17
-12 37 -37 44 -56 13 -30 13 -38 -2 -68 -9 -20 -35 -46 -62 -63 -53 -33 -908
-476 -1170 -606 -140 -69 -183 -86 -218 -86 -54 0 -89 30 -99 81 -3 19 -5 350
-3 736 3 694 3 702 24 730 44 58 95 52 264 -32z m3961 -309 c359 -186 666
-347 682 -358 31 -22 57 -67 57 -98 0 -50 -59 -87 -490 -311 -757 -392 -921
-472 -973 -473 -38 -1 -52 4 -74 27 l-28 27 -3 708 c-2 389 0 722 3 741 4 18
18 44 31 58 23 22 32 25 83 21 52 -4 115 -34 712 -342z m-1710 -860 c359 -186
668 -349 685 -361 37 -28 59 -75 50 -109 -3 -14 -23 -39 -44 -56 -64 -54
-1306 -691 -1385 -711 -57 -14 -95 2 -119 51 -16 32 -17 87 -14 745 4 841 -4
788 117 782 51 -3 107 -30 710 -341z m-1729 -861 c381 -197 712 -368 736 -382
80 -46 73 -108 -17 -156 -219 -117 -1064 -543 -1108 -558 -214 -73 -366 11
-412 229 -7 35 -11 240 -11 616 0 562 0 563 22 591 18 23 27 26 59 22 22 -3
317 -149 731 -362z"/>
</g>
</svg>
</div>
이렇게 나오지만,
웹페이지 요소 검사로는 정상적인 HTML을 볼 수 있습니다.
<div class="timeline__timeframe timeline--2018-12-16" ng-repeat="day in $ctrl.cinemaReleases.concat($ctrl.getCurrentTitles().items) | orderBy:[ ['cinema-upcoming'].indexOf($ctrl.activeCollectionType) !== -1 ? 'date' : '-date' ]" ng-if="(
['cinema-upcoming'].indexOf($ctrl.activeCollectionType) !== -1) ||
(day.date >= $ctrl.currentDate && day.titlelist_name && !$ctrl.timelineHideCinemaProps.hideTimelineCinema) ||
(day.date >= $ctrl.currentDate && !(day.titlelist_name)
)" ng-init="($index + 1) === $ctrl.getCurrentTitles().items.length && $ctrl.hasRendered()">
<!-- HEADING -->
<timeline-header active-collection-type="::$ctrl.activeCollectionType" day="day">
<!-- TITLE LIST AND CINEMA UPCOMING -->
<!---->
<!-- TITLE LIST AND NOT CINEMA UPCOMING -->
<!---->
<!-- NO TITLE LIST NAME -->
<!----><span ng-if="::!$ctrl.day.titlelist_name">
<!----><span class="timeline__timeframe__head" ng-if="::($ctrl.day.date == $ctrl.today)" translate="WEBAPP_TODAY" translate-default="Today">오늘</span><!---->
<!---->
<!---->
</span><!---->
</timeline-header>
<!--Sticky Specials-->
<!-- @todo check if $index == 0 is better than $ctrl.today -->
<!----><div ng-if="(day.date === $ctrl.today && !day.titlelist_name) && $ctrl.stateParams.timeline_type !== 'pricedrops'">
<!---->
</div><!---->
<!--Regular Specials-->
<!---->
<!----><!----><div style="width: 100%;" class="timeline__provider-block timeline--2018-12-16--8" ng-repeat="provider in ::day.providers track by (provider.provider_id + '-' + provider.total)" ng-if="!isHidden(provider.id) || inBigBrotherMode === false">
<div class="timeline__provider-block__logo">
<!----><!----><img ng-repeat="icon in ::$ctrl.filters.providers | filter:{id: provider.provider_id}:true" class="price-comparison__grid__row__icon" alt="Netflix" ng-src="https://images.justwatch.com/icon/430997/s25" ng-if="::(!isCloaked(provider.provider_id) || inBigBrotherMode === false && !day.titlelist_name)" src="https://images.justwatch.com/icon/430997/s25"><!----><!---->
<!---->
<!-- only show this if provider.short_name is cloaked -->
<!---->
</div>
<div class="timeline__provider-block__head">
<span>1</span>
<span translate="WEBAPP_TITLE" translate-default="titles">개 영상</span>
</div>
<horizontal-titles-row titles="provider.items" track-action-name="new.2018-12-16.8" infinite-scroll="$ctrl.addMoreItemsHorizontal(day.date, provider.provider_id)" infinite-scroll-disabled="provider.items.length == provider.total" infinite-scroll-distance="0.4">
<horizontal-scrollable scrollable-id="::$ctrl.trackActionName" horizontal-scrollable-style="basic-gradient always-right-visible">
<div class="horizontal-scroll timeline__provider-block__titles__wrapper basic-gradient always-right-visible" ng-class="{
'timeline__provider-block__titles__wrapper--nav-left': $ctrl.scrollers.left,
'timeline__provider-block__titles__wrapper--nav-right': $ctrl.scrollers.right
}">
<!---->
<!---->
<ng-transclude>
<div class="horizontal-scrollable-container timeline__provider-block__titles" style="padding-right: 15px; margin-top: -17px; bottom: -17px;" ng-class="{'horizontal-scrollable-container--has-more': !$ctrl.infiniteScrollDisabled}" horizontal-infinite-scroll="$ctrl.infiniteScroll()" horizontal-infinite-scroll-disabled="$ctrl.infiniteScrollDisabled" horizontal-infinite-scroll-distance="::$ctrl.infiniteScrollDistance">
<!----><div class="main-content__poster" style="position: relative !important;" ng-repeat="title in $ctrl.titles" snowplow-data-contextable="" contextable-type="title" contextable-data="::title">
<title-card target-title="::title" responsive-image="::true" upon-click="$ctrl.uponClickCard({title: title})">
<!-- poster -->
<div snowplow-data-contextable="" contextable-type="title" contextable-data="::$ctrl.targetTitle" class="pos-relative">
<watchlist-ribbon target-title="::$ctrl.targetTitle">
<div class="bookmark" ng-click="$ctrl.toggle($ctrl.contentType, $ctrl.contentId)">
</div>
</watchlist-ribbon>
</div>
<track-title-control object-type="::$ctrl.targetTitle.object_type" object-id="::$ctrl.targetTitle.id" target-title="::$ctrl.targetTitle" excludes="::$ctrl.excludes" layout="card">
<!----><div class="track-title-control track-title-control--hidden" ng-if="::($ctrl.layout === 'card')" ng-class="{
'track-title-control--seen': $ctrl.props.inLocalSeenlist,
'track-title-control--tracked': $ctrl.props.inLocalWatchlist,
'track-title-control--hidden': $ctrl.getActiveVariant('SEENLIST_1') <= 0
}">
<div ng-transclude="">
<div ng-class="::{'main-content__poster__image': $ctrl.responsiveImage}" style="position: relative;" class="main-content__poster__image">
<a ng-href="/kr/TV-프로그램/sky-kaeseul/시즌-1" ng-click="$ctrl.uponClick(title)" href="/kr/TV-프로그램/sky-kaeseul/시즌-1">
</a><div class="main-content__poster__image__container" style="position: relative;"><a ng-href="/kr/TV-프로그램/sky-kaeseul/시즌-1" ng-click="$ctrl.uponClick(title)" href="/kr/TV-프로그램/sky-kaeseul/시즌-1">
<tv-show-badge target-title="::$ctrl.targetTitle">
<div class="tv-show-badge">
<div class="tv-show-badge__box">
<!----><div ng-if="::($ctrl.collectionType && $ctrl.targetTitle.object_type == 'show_season' && $ctrl.newElementCount >= 1)" class="tv-show-badge__box__new">
<!---->
<!----><span ng-if="::$ctrl.newElementCount == 1" translate="WEBAPP_NEW_EPISODE" translate-default="New episode">새 에피소드</span><!---->
</div><!---->
<!----><div ng-if="::(['show', 'show_season'].indexOf($ctrl.targetTitle.object_type) !== -1)" class="tv-show-badge__box__season">
<!----><div ng-if="::($ctrl.collectionType && $ctrl.targetTitle.object_type == 'show_season')">
<span translate="WEBAPP_SEASON" translate-default="season">시즌</span> 1
</div><!---->
<!---->
</div><!---->
</div>
</div>
</tv-show-badge>
<img src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" style="width: 100%; width: 100%; padding-bottom: 42%;">
<!----><img class="img-responsive poster notransition" style="width: 100%; height: auto; position: absolute; top: 0px; opacity: 1;" alt="Season 1" ng-if="::$ctrl.targetTitle.poster" ng-src="https://images.justwatch.com/poster/97661109/s166/시즌-1" fader-on-load="" src="https://images.justwatch.com/poster/97661109/s166/시즌-1"><!---->
<!---->
<!-- pricedrops -->
</a><!---->
</div>
<!-- showtimes -->
<!---->
</div>
</div>
<!----><div class="track-title-control__container" ng-if="::$ctrl.excludes.indexOf('track-title-control') === -1">
<div class="row track-title-control__container__row">
<div style="display: flex; width: 100%; background-color: rgba(38, 48, 59, 0.4);">
<div class="track-title-control__container__row--track" ng-click="$ctrl.toggleWatched()">
<span class="glyphicon glyphicon-bookmark"></span>
<!----><span style="opacity: 0.8; font-size: 12px;" ng-if="::($ctrl.objectType !== 'movie')" translate="WEBAPP_SEENLIST_TRACK_SHOW" translate-default="Track show">Track show</span><!---->
</div>
<!---->
</div>
</div>
</div><!---->
<div class="clear-fix"></div>
</div><!---->
<!---->
</track-title-control>
<!-- TODO extend with pricedrops for new.tpl.html -->
</title-card>
</div><!---->
</div>
</ng-transclude>
</div>
</horizontal-scrollable>
<!---->
</horizontal-titles-row>
<!----><new-timeline-ad-block style="width: 100%; display: block;" ng-if="['cinema-upcoming'].indexOf($ctrl.activeCollectionType) === -1" index="$parent.$parent.$parent.$index * 100 + $index" first-ad-at="0" every-x-buckets="4" caps-at="3">
<!----><jw-ad-block ng-if="::$ctrl.hasAdBlock && ['us', 'ca', 'au', 'xxx'].indexOf($ctrl.webLocale) === -1" position="timeline_inbetween" sizes="::['728x90', '300x250']" container-style="margin-top: 12px;">
<!----><div ng-if="$ctrl.isShown && !$ctrl.isExcludedFromAds" class="jw-ad-block row" style="margin-top: 12px;;">
<div class="jw-ad-block__container" style="width: 728px; height: 90px;">
<!----><div data-ng-show="adFitInViewport" class="ads" ng-if="$ctrl.isLoaded && $ctrl.adProvider === 'adsense'" ad-client="ca-pub-3179068936475881" ad-slot="2516381812" inline-style="display: inline-block; width: 728px; height: 90px;"><ins data-ng-class="{'adsbygoogle': adFitInViewport}" data-ad-client="ca-pub-3179068936475881" data-ad-slot="2516381812" ng-attr-data-ad-format="{{adFormat || undefined}}" style="display: inline-block; width: 728px; height: 90px;" "="" class="adsbygoogle"></ins></div><!---->
<!---->
</div>
<div class="jw-ad-block__container__remove">
<a href="" ng-click="$ctrl.openLoginModal()">
<span class="glyphicon glyphicon-remove"></span> <span translate="WEBAPP_LOGINS_REMOVE" translate-default="Remove this ad">Remove this ad</span>
</a>
</div>
</div><!---->
</jw-ad-block><!---->
<!---->
</new-timeline-ad-block><!---->
</div><!----><!---->
</div>
혹시 selenium으로 아래쪽 소스를 볼 수 있을까요?
가능성이 없다면, 최소한 불가능하다는 사실만이라도 알고싶습니다.
뭐라고 검색해야 할지 감도 못 잡겠어서.. 염치 불고하고 질문 글 남깁니다.
-
(•́ ✖ •̀)
알 수 없는 사용자
1 답변
-
질문이라는 것이 가능여부인가요?
당연히 가능합니다.
selenium이라면 직접 브라우저를 핸들링하는 것입니다.
import io

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Selenium drives a real browser, so the JavaScript-rendered DOM (what the
# element inspector shows) is reachable -- but only once the page's scripts
# have finished rendering. Reading page_source right after get() can still
# return the initial loading-spinner template.
driver = webdriver.Chrome()  # the question mentions Chrome; any WebDriver works
try:
    driver.get('https://www.justwatch.com/kr')

    # Wait until at least one rendered timeline block exists. The selector is
    # taken from the inspected DOM quoted in the question.
    # NOTE(review): confirm this class is present on the exact URL being monitored.
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.timeline__timeframe')
            )
        )
    except TimeoutException:
        # Best effort: still save whatever has been rendered so it can be inspected.
        print('timed out waiting for .timeline__timeframe; saving current DOM')

    with io.open('/home/allinux/test.html', mode='w', encoding='utf-8') as f:
        f.write(driver.page_source)
finally:
    # Always close the browser, even if navigation or the file write fails.
    driver.quit()
- 네 selenium 으로 시도해서 나온 결과물이 실제 화면에 나오는 내용과 차이가 있어서 질문글을 올렸습니다 ^^;; 알 수 없는 사용자 2018.12.17 15:59
- 실제 웹 브라우저에서 표시되는 내용은 아래 코드이고, 우클릭-소스보기로 볼 수 있는 내용은 위의 코드입니다. selenium으로 get 해오면 위의 코드가 나오지만 실제로 필요한 정보는 아래 코드라.. 아래 코드를 보고 싶은데 쉽지가 않네요ㅜ 알 수 없는 사용자 2018.12.17 16:01
댓글 입력