WEB MINING
SUBJECT CODE : CIE - 431P
LAB FILE
Submitted in
Department of Computer Science & Engineering
SUBMITTED TO:- SUBMITTED BY:-
Name: Nitin Sharma
Enrollment No.: 03027202721
Class: B.Tech (CSE-A)
INDEX

S.No  AIM OF EXPERIMENT                                                   Page No  Signature
1     Implement Page Rank Algorithm in Web Mining                         1
2     Analyze the link structure of the web using the PageRank algorithm  2-3
3     Text and webpage pre-processing                                     4-5
4     Social network analysis                                             6-7
5     Opinion mining                                                      8
6     Sentiment analysis                                                  9
7     Privatization of web content                                        10
8     Web usage mining                                                    11
9     Recommender system                                                  12
10    Web structure mining                                                13
EXPERIMENT - 1
CODE : Implement Page Rank Algorithm in Web Mining
import numpy as np
def pageRank(M, num_iter: int = 100, d: float = 0.85):
    """
    Parameters
    ----------
    M : numpy array
        adjacency matrix where M[i, j] represents the link from 'j' to 'i',
        such that for all 'j': sum(i, M[i, j]) = 1
    num_iter : int, optional
        number of iterations (default 100)
    d : float, optional
        damping factor (default 0.85)

    Returns
    -------
    numpy array
        vector of ranks such that v[i] is the i-th rank from [0, 1];
        v sums to 1
    """
    N = M.shape[1]
    v = np.ones(N) / N
    M_hat = d * M + (1 - d) / N
    for _ in range(num_iter):
        v = M_hat @ v
    return v
M = np.array([[0, 0, 0, 0, 1],
[0.5, 0, 0, 0, 0],
[0.5, 1, 0, 0, 0],
[0, 0, 1, 0.5, 0],
[0, 0, 0, 0.5, 0]])
v = pageRank(M, 100, 0.85)
print(v)
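A quick sanity check (a minimal sketch, reusing the pageRank function and matrix M defined above): because M is column-stochastic, the returned rank vector should be non-negative and sum to approximately 1.

# Sanity check (sketch): ranks sum to ~1 for a column-stochastic M
v = pageRank(M, 100, 0.85)
print("Sum of ranks:", round(float(v.sum()), 6))
print("Highest-ranked page index:", int(np.argmax(v)))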
OUTPUT
EXPERIMENT – 2
CODE : Analyze the link structure of the web using the PageRank algorithm
import numpy as np
def create_transition_matrix(links, num_pages):
    # Create an empty transition matrix
    transition_matrix = np.zeros((num_pages, num_pages))
    for page, outbound_links in links.items():
        if outbound_links:  # if the page has outbound links
            for outbound_page in outbound_links:
                transition_matrix[outbound_page][page] = 1 / len(outbound_links)
        else:  # if there are no outbound links (dangling page), distribute evenly
            transition_matrix[:, page] = 1 / num_pages
    return transition_matrix

def page_rank(links, num_pages, damping_factor=0.85, max_iterations=100, tol=1e-6):
    # Create the transition matrix
    transition_matrix = create_transition_matrix(links, num_pages)
    # Initialize the rank vector with equal values
    ranks = np.ones(num_pages) / num_pages
    # PageRank formula includes a damping factor
    for iteration in range(max_iterations):
        new_ranks = ((1 - damping_factor) / num_pages
                     + damping_factor * np.dot(transition_matrix, ranks))
        # Check for convergence
        if np.linalg.norm(new_ranks - ranks) < tol:
            break
        ranks = new_ranks
    return ranks
# Example usage:
# Let's assume we have 4 pages, and the link structure is as follows:
# Page 0 has links to Page 1 and Page 2
# Page 1 has a link to Page 2
# Page 2 has a link to Page 0
# Page 3 has no outbound links
links = {
0: [1, 2],
1: [2],
2: [0],
3: []
}
num_pages = 4
ranks = page_rank(links, num_pages)
print("Page Ranks:", ranks)
OUTPUT :
EXPERIMENT - 3
CODE : Text and webpage pre-processing
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
# Function to fetch and parse a web page
def fetch_web_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return None

# Function to extract links and clean text from a web page
def preprocess_web_page(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for script in soup(["script", "style"]):
        script.decompose()
    text = soup.get_text()
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    links = set()  # To avoid duplicate links
    for link in soup.find_all('a', href=True):
        links.add(link['href'])
    return text, links

# Function to build a link structure (graph)
def build_link_structure(url_list):
    link_structure = {}
    for idx, url in enumerate(url_list):
        html_content = fetch_web_page(url)
        if html_content:
            _, links = preprocess_web_page(html_content)
            link_structure[idx] = [url_list.index(link) for link in links if link in url_list]
        else:
            link_structure[idx] = []
    return link_structure
# Example list of web pages to process
urls = [
'https://example.com/page1',
'https://example.com/page2',
'https://example.com/page3'
]
link_structure = build_link_structure(urls)
print("Link Structure:", link_structure)
OUTPUT
EXPERIMENT - 4
CODE : Social network analysis
import networkx as nx
import matplotlib.pyplot as plt
# Step 1: Create a Social Network Graph
def create_social_network():
    G = nx.Graph()
    # Adding nodes (individuals)
    G.add_nodes_from(["Alice", "Bob", "Charlie", "David", "Eve", "Frank"])
    # Adding edges (relationships)
    G.add_edges_from([("Alice", "Bob"),
                      ("Alice", "Charlie"),
                      ("Bob", "David"),
                      ("Charlie", "David"),
                      ("David", "Eve"),
                      ("Eve", "Frank"),
                      ("Frank", "Alice")])  # Alice is connected back to Frank
    return G

# Step 2: Compute Centrality Measures
def compute_centralities(G):
    degree_centrality = nx.degree_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    return degree_centrality, closeness_centrality, betweenness_centrality

# Step 3: Visualize the Social Network
def visualize_network(G, centrality):
    pos = nx.spring_layout(G)  # Layout for node positions
    plt.figure(figsize=(8, 6))
    # Draw the nodes with varying sizes based on centrality
    node_size = [v * 3000 for v in centrality.values()]
    nx.draw(G, pos, with_labels=True, node_size=node_size, node_color='skyblue',
            font_weight='bold')
    plt.title("Social Network Visualization")
    plt.show()

# Main Execution
if __name__ == "__main__":
    # Create the social network
    G = create_social_network()
    # Compute centrality measures
    degree_centrality, closeness_centrality, betweenness_centrality = compute_centralities(G)
    # Print centrality measures
    print("Degree Centrality:", degree_centrality)
    print("Closeness Centrality:", closeness_centrality)
    print("Betweenness Centrality:", betweenness_centrality)
    # Visualize the network based on Degree Centrality
    visualize_network(G, degree_centrality)
OUTPUT
Degree Centrality: {'Alice': 0.6000000000000001, 'Bob': 0.4, 'Charlie': 0.4,
'David': 0.6000000000000001, 'Eve': 0.4, 'Frank': 0.4}
Closeness Centrality: {'Alice': 0.7142857142857143, 'Bob': 0.625, 'Charlie':
0.625, 'David': 0.7142857142857143, 'Eve': 0.625, 'Frank': 0.625}
Betweenness Centrality: {'Alice': 0.25, 'Bob': 0.05, 'Charlie': 0.05, 'David':
0.25, 'Eve': 0.1, 'Frank': 0.1}
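The node-level centralities above can be complemented with a couple of graph-level statistics; the sketch below assumes the same create_social_network() graph and networkx import.

# Graph-level statistics (sketch) for the same six-person network
G = create_social_network()
print("Density:", nx.density(G))
print("Average shortest path length:", nx.average_shortest_path_length(G))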
EXPERIMENT – 5
CODE : Opinion mining
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
data = {
'Text': [
'I love this product! It works wonderfully.',
'This is the worst service I have ever used.',
'The experience was okay, nothing special.',
'Fantastic quality and amazing performance.',
'I am very disappointed with the purchase.'
],
'Sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative']
}
df = pd.DataFrame(data)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Text'])
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
OUTPUT:
Accuracy: 1.0
Classification Report:
precision recall f1-score support
negative 1.00 1.00 1.00 1
accuracy 1.00 1
macro avg 1.00 1.00 1.00 1
weighted avg 1.00 1.00 1.00 1
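Once trained, the same vectorizer and model can label unseen text; the sketch below assumes the objects created above, and the input sentence is illustrative only.

# Classifying a new, unseen review (sketch)
new_text = ["The product quality is amazing."]
print(model.predict(vectorizer.transform(new_text)))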
EXPERIMENT – 6
CODE : Sentiment analysis
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
sentences = [
"I absolutely love this! It's fantastic.",
"This is the worst thing ever.",
"I'm feeling pretty neutral about this.",
"What an amazing experience!",
"I hate it when this happens."
]
for sentence in sentences:
    sentiment = sia.polarity_scores(sentence)
    print(f"Sentence: {sentence}")
    print(f"Sentiment Scores: {sentiment}\n")
OUTPUT :
[nltk_data] Downloading package vader_lexicon to
[nltk_data] C:\Users\AppData\Roaming\nltk_data...
Sentence: I absolutely love this! It's fantastic.
Sentiment Scores: {'neg': 0.0, 'neu': 0.264, 'pos': 0.736, 'compound': 0.855}
Sentence: This is the worst thing ever.
Sentiment Scores: {'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}
Sentence: I'm feeling pretty neutral about this.
Sentiment Scores: {'neg': 0.0, 'neu': 0.46, 'pos': 0.54, 'compound': 0.5719}
Sentence: What an amazing experience!
Sentiment Scores: {'neg': 0.0, 'neu': 0.423, 'pos': 0.577, 'compound': 0.6239}
Sentence: I hate it when this happens.
Sentiment Scores: {'neg': 0.481, 'neu': 0.519, 'pos': 0.0, 'compound': -0.5719}
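The compound score is often mapped to a discrete label using the commonly cited +/-0.05 thresholds; this mapping is a convention, not part of the lab code, and the sketch below reuses the sia analyzer and sentences defined above.

def label_from_compound(compound):
    # Conventional thresholds (assumption): >= 0.05 positive, <= -0.05 negative, else neutral
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

for sentence in sentences:
    score = sia.polarity_scores(sentence)['compound']
    print(sentence, "->", label_from_compound(score))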
EXPERIMENT – 7
CODE : Privatization of web content
from flask import Flask, request, jsonify, session, redirect, url_for
from werkzeug.security import generate_password_hash, check_password_hash
app = Flask(__name__)
app.secret_key = 'your_secret_key'
users = {
"user1": generate_password_hash("password1"),
"user2": generate_password_hash("password2"),
}
private_content = {
"user1": "This is private content for user1.",
"user2": "This is private content for user2."
}
@app.route('/')
def home():
    return "Welcome to the Web Content Privatization Demo! Please log in."

@app.route('/login', methods=['POST'])
def login():
    username = request.form['username']
    password = request.form['password']
    if username in users and check_password_hash(users[username], password):
        session['username'] = username
        return jsonify({"message": "Login successful!"})
    return jsonify({"message": "Invalid credentials!"}), 401

@app.route('/logout')
def logout():
    session.pop('username', None)
    return redirect(url_for('home'))

@app.route('/private')
def private():
    if 'username' in session:
        user = session['username']
        return jsonify({"content": private_content[user]})
    return jsonify({"message": "Unauthorized access!"}), 401

if __name__ == '__main__':
    app.run(debug=True)
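With the Flask app running locally (a sketch that assumes the default development server on 127.0.0.1:5000 and the requests library), the login and private routes can be exercised as follows; the Session object keeps the login cookie between calls.

import requests

s = requests.Session()   # keeps the session cookie set by /login
r = s.post("http://127.0.0.1:5000/login",
           data={"username": "user1", "password": "password1"})
print(r.json())
print(s.get("http://127.0.0.1:5000/private").json())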
OUTPUT:
EXPERIMENT – 8
CODE : Web usage mining
import pandas as pd
from datetime import datetime
data = {
'UserID': [1, 1, 1, 2, 2, 3, 3, 3, 3],
'Page': ['Home', 'Product', 'Cart', 'Home', 'About', 'Home', 'Product', 'Cart', 'Checkout'],
'Timestamp': [
'2024-11-01 10:00:00', '2024-11-01 10:05:00', '2024-11-01 10:10:00',
'2024-11-01 10:00:00', '2024-11-01 10:07:00',
'2024-11-01 11:00:00', '2024-11-01 11:02:00', '2024-11-01 11:05:00', '2024-11-01 11:10:00'
]
}
df = pd.DataFrame(data)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Next_Timestamp'] = df.groupby('UserID')['Timestamp'].shift(-1)
df['Session_Duration'] = (df['Next_Timestamp'] - df['Timestamp']).dt.total_seconds().fillna(0)
page_counts = df['Page'].value_counts()
session_duration = df.groupby('UserID')['Session_Duration'].sum()
print("Most Visited Pages:\n", page_counts)
print("\nAverage Session Duration per User:\n", session_duration)
OUTPUT:
Most Visited Pages:
Page
Home 3
Product 2
Cart 2
About 1
Checkout 1
Name: count, dtype: int64
Average Session Duration per User:
UserID
1 600.0
2 420.0
3 600.0
Name: Session_Duration, dtype: float64
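The same DataFrame also supports simple navigation-path mining; the sketch below reuses the df built above and reconstructs each user's click sequence in time order.

# Click path per user (sketch)
paths = df.sort_values('Timestamp').groupby('UserID')['Page'].apply(list)
print("Click paths per user:\n", paths)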
EXPERIMENT – 9
CODE : Recommender system
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
data = {
'User': ['User1', 'User1', 'User1', 'User2', 'User2', 'User3', 'User3', 'User3', 'User3'],
'Item': ['Item1', 'Item2', 'Item3', 'Item1', 'Item4', 'Item1', 'Item2', 'Item3', 'Item4'],
'Rating': [5, 3, 4, 4, 5, 5, 4, 3, 2]
}
df = pd.DataFrame(data)
user_item_matrix = df.pivot_table(index='User', columns='Item', values='Rating').fillna(0)
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index,
columns=user_item_matrix.index)
def recommend(user, user_similarity_df, user_item_matrix, n_recommendations=2):
    scores = user_similarity_df[user].sort_values(ascending=False)
    similar_users = scores.index[scores > 0].tolist()
    recommendations = pd.Series(dtype=float)
    for similar_user in similar_users:
        weighted_ratings = user_item_matrix.loc[similar_user] * scores[similar_user]
        recommendations = recommendations.add(weighted_ratings, fill_value=0)
    already_rated = user_item_matrix.loc[user]
    recommendations = recommendations[already_rated == 0].sort_values(ascending=False)
    return recommendations.head(n_recommendations)
print("Recommendations for User1:\n", recommend('User1', user_similarity_df, user_item_matrix))
OUTPUT:
Recommendations for User1:
Item
Item4 4.094641
dtype: float64
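The recommend function is not specific to User1; as a usage note (reusing the objects built above), it can be called for any user in the matrix, for example User2.

# Recommendations for another user (sketch)
print("Recommendations for User2:\n", recommend('User2', user_similarity_df, user_item_matrix))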
EXPERIMENT – 10
CODE: Web structure mining
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(["PageA", "PageB", "PageC", "PageD", "PageE"])
G.add_edges_from([
("PageA", "PageB"),
("PageA", "PageC"),
("PageB", "PageC"),
("PageC", "PageA"),
("PageC", "PageD"),
("PageD", "PageE"),
("PageE", "PageD")
])
pagerank = nx.pagerank(G, alpha=0.85)
for page, rank in pagerank.items():
    print(f"{page}: {rank:.4f}")
OUTPUT:
PageA: 0.0805
PageB: 0.0642
PageC: 0.1188
PageD: 0.3819
PageE: 0.3546
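Besides PageRank, hub and authority scores (HITS) are another standard web-structure-mining measure; the sketch below reuses the same directed graph G and networkx's hits function.

# HITS hub and authority scores (sketch) on the same graph
hubs, authorities = nx.hits(G)
print("Hubs:", {p: round(h, 4) for p, h in hubs.items()})
print("Authorities:", {p: round(a, 4) for p, a in authorities.items()})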