1234567891011121314151617181920212223242526272829303132333435 |
- #!/usr/bin/python3
- # web-crawler-rap v0.0.1
- # Copyright (C) 2017 Özcan Oğuz, Zeynep Topsakal
- #
- # This program is free software: you can redistribute it and/or modify it under the terms of the GNU General
- # Public License as published by the Free Software Foundation, either version 3 of the License, or (at your
- # option) any later version.
- #
- # This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
- # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- # for more details.
- #
- # You should have received a copy of the GNU General Public License along with this program.
- # If not, see <http://www.gnu.org/licenses/>.
- from bs4 import BeautifulSoup;
- import requests;
- import urllib;
- import re;
def extract_lyrics_text(markup):
    """Turn the HTML markup of the lyrics container into plain text.

    Two substitutions are applied, in this order:

    1. Every tag (``<...>``, matched non-greedily) becomes a newline.
    2. On each line, everything up to and including the last ``;`` or ``:``
       becomes a newline.  NOTE(review): presumably this drops HTML entities
       (``&amp;``) and "Label:" prefixes -- confirm against the target site.

    Args:
        markup: HTML source of the lyrics element, as a string.

    Returns:
        The cleaned text; an empty string for empty input.
    """
    without_tags = re.sub('<.*?>', '\n', markup)
    return re.sub('.*[;:]', '\n', without_tags)


def safe_filename(title):
    """Return *title* with path separators replaced by underscores.

    A page title is used as a file name; a ``/`` or ``\\`` in it would
    otherwise be interpreted as a directory separator by ``open``.
    """
    return re.sub(r'[/\\]', '_', title)


def main():
    """Download every URL listed in ``sarkilar.txt`` and save its lyrics.

    For each non-blank line of ``sarkilar.txt`` the page is fetched, the
    ``<div class="reading">`` element is extracted, reduced to plain text and
    appended to ``sozler/<page title>.txt``.  A URL that cannot be fetched or
    a page without a lyrics element is reported and skipped, so one bad entry
    does not abort the whole run.
    """
    # Read the whole list up front so the input file is closed before any
    # network traffic starts.  strip() removes the trailing newline that
    # would otherwise be sent as part of the URL.
    with open('sarkilar.txt', 'r', encoding='utf-8') as url_file:
        urls = [line.strip() for line in url_file]

    for i, url in enumerate(urls):
        if not url:
            # Blank line in the list: nothing to fetch.
            continue

        try:
            # Without a timeout requests may wait forever on a dead server.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            print(i, "numaralı", url, "indirilemedi:", err)
            continue

        crawler = BeautifulSoup(response.content, "html.parser")

        lyrics_div = crawler.find('div', {"class": "reading"})
        if lyrics_div is None:
            # str(None) would otherwise be written to the file as "None".
            print(i, "numaralı", url, "atlandı: şarkı sözü bulunamadı.")
            continue

        try:
            title = crawler.html.head.title.string
        except AttributeError:
            # The page has no <html>, <head> or <title> element.
            title = None
        if not title:
            # Fall back to the list index so the lyrics are still saved.
            title = 'sarki_{}'.format(i)

        out_path = 'sozler/{}.txt'.format(safe_filename(title))
        # Append mode is kept: re-running adds to an existing file.
        with open(out_path, 'a', encoding='utf-8') as out_file:
            out_file.write(extract_lyrics_text(str(lyrics_div)))

        print(i, "numaralı", title, "yazıldı.")


if __name__ == "__main__":
    main()
|