WEB MINING
SUBJECT CODE : CIE - 431P
LAB FILE
Submitted in
Department of Computer Science & Engineering
SUBMITTED TO:- SUBMITTED BY:-
Name: Nitin Sharma
Enrollment No.: 03027202721
Class: B.Tech (CSE-A)
INDEX

S.No  AIM OF EXPERIMENT                                                   Page No  Signature
1     Implement Page Rank Algorithm in Web Mining                         1
2     Analyze the link structure of the web using the PageRank algorithm  2-3
3     Text and webpage pre-processing                                     4-5
4     Social network analysis                                             6-7
5     Opinion mining                                                      8
6     Sentiment analysis                                                  9
7     Privatization of web content                                        10
8     Web usage mining                                                    11
9     Recommender system                                                  12
10    Web structure mining                                                13
EXPERIMENT - 1
CODE : Implement Page Rank Algorithm in Web Mining
import numpy as np
def pageRank(M, num_iter: int = 100, d: float = 0.85):
    """
    Parameters
    ----------
    M : numpy array
        adjacency matrix where M[i, j] represents the link from 'j' to 'i',
        such that for all 'j': sum(i, M[i, j]) = 1
    num_iter : int, optional
        number of iterations (default 100)
    d : float, optional
        damping factor (default 0.85)

    Returns
    -------
    numpy array
        vector of ranks such that v[i] is the i-th rank from [0, 1];
        v sums to 1
    """
    N = M.shape[1]
    v = np.ones(N) / N
    M_hat = d * M + (1 - d) / N
    for _ in range(num_iter):
        v = M_hat @ v
    return v
M = np.array([[0, 0, 0, 0, 1],
[0.5, 0, 0, 0, 0],
[0.5, 1, 0, 0, 0],
[0, 0, 1, 0.5, 0],
[0, 0, 0, 0.5, 0]])
v = pageRank(M, 100, 0.85)
print(v)
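A quick sanity check (a minimal sketch, reusing the pageRank function and matrix M defined above): because M is column-stochastic, the returned rank vector should be non-negative and sum to approximately 1.

# Sanity check (sketch): ranks sum to ~1 for a column-stochastic M
v = pageRank(M, 100, 0.85)
print("Sum of ranks:", round(float(v.sum()), 6))
print("Highest-ranked page index:", int(np.argmax(v)))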
OUTPUT
EXPERIMENT – 2
CODE : Analyze the link structure of the web using the PageRank algorithm
import numpy as np
def create_transition_matrix(links, num_pages):
    # Create an empty transition matrix
    transition_matrix = np.zeros((num_pages, num_pages))
    for page, outbound_links in links.items():
        if outbound_links:  # if the page has outbound links
            for outbound_page in outbound_links:
                transition_matrix[outbound_page][page] = 1 / len(outbound_links)
        else:  # if there are no outbound links (dangling page), distribute evenly
            transition_matrix[:, page] = 1 / num_pages
    return transition_matrix

def page_rank(links, num_pages, damping_factor=0.85, max_iterations=100, tol=1e-6):
    # Create the transition matrix
    transition_matrix = create_transition_matrix(links, num_pages)
    # Initialize the rank vector with equal values
    ranks = np.ones(num_pages) / num_pages
    # PageRank formula includes a damping factor
    for iteration in range(max_iterations):
        new_ranks = ((1 - damping_factor) / num_pages
                     + damping_factor * np.dot(transition_matrix, ranks))
        # Check for convergence
        if np.linalg.norm(new_ranks - ranks) < tol:
            break
        ranks = new_ranks
    return ranks
# Example usage:
# Let's assume we have 4 pages, and the link structure is as follows:
# Page 0 has links to Page 1 and Page 2
# Page 1 has a link to Page 2
# Page 2 has a link to Page 0
# Page 3 has no outbound links
links = {
0: [1, 2],
1: [2],
2: [0],
3: []
}
num_pages = 4
ranks = page_rank(links, num_pages)
print("Page Ranks:", ranks)
OUTPUT :
EXPERIMENT - 3
CODE : Text and webpage pre-processing
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
# Function to fetch and parse a web page
def fetch_web_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return None

# Function to extract links and clean text from a web page
def preprocess_web_page(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for script in soup(["script", "style"]):
        script.decompose()
    text = soup.get_text()
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    links = set()  # To avoid duplicate links
    for link in soup.find_all('a', href=True):
        links.add(link['href'])
    return text, links

# Function to build a link structure (graph)
def build_link_structure(url_list):
    link_structure = {}
    for idx, url in enumerate(url_list):
        html_content = fetch_web_page(url)
        if html_content:
            _, links = preprocess_web_page(html_content)
            link_structure[idx] = [url_list.index(link) for link in links if link in url_list]
        else:
            link_structure[idx] = []
    return link_structure
# Example list of web pages to process
urls = [
'https://example.com/page1',
'https://example.com/page2',
'https://example.com/page3'
]
link_structure = build_link_structure(urls)
print("Link Structure:", link_structure)
OUTPUT
EXPERIMENT - 4
CODE : Social network analysis
import networkx as nx
import matplotlib.pyplot as plt
# Step 1: Create a Social Network Graph
def create_social_network():
    G = nx.Graph()
    # Adding nodes (individuals)
    G.add_nodes_from(["Alice", "Bob", "Charlie", "David", "Eve", "Frank"])
    # Adding edges (relationships)
    G.add_edges_from([("Alice", "Bob"),
                      ("Alice", "Charlie"),
                      ("Bob", "David"),
                      ("Charlie", "David"),
                      ("David", "Eve"),
                      ("Eve", "Frank"),
                      ("Frank", "Alice")])  # Alice is connected back to Frank
    return G

# Step 2: Compute Centrality Measures
def compute_centralities(G):
    degree_centrality = nx.degree_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    return degree_centrality, closeness_centrality, betweenness_centrality

# Step 3: Visualize the Social Network
def visualize_network(G, centrality):
    pos = nx.spring_layout(G)  # Layout for node positions
    plt.figure(figsize=(8, 6))
    # Draw the nodes with varying sizes based on centrality
    node_size = [v * 3000 for v in centrality.values()]
    nx.draw(G, pos, with_labels=True, node_size=node_size, node_color='skyblue',
            font_weight='bold')
    plt.title("Social Network Visualization")
    plt.show()

# Main Execution
if __name__ == "__main__":
    # Create the social network
    G = create_social_network()
    # Compute centrality measures
    degree_centrality, closeness_centrality, betweenness_centrality = compute_centralities(G)
    # Print centrality measures
    print("Degree Centrality:", degree_centrality)
    print("Closeness Centrality:", closeness_centrality)
    print("Betweenness Centrality:", betweenness_centrality)
    # Visualize the network based on Degree Centrality
    visualize_network(G, degree_centrality)
OUTPUT
Degree Centrality: {'Alice': 0.6000000000000001, 'Bob': 0.4, 'Charlie': 0.4,
'David': 0.6000000000000001, 'Eve': 0.4, 'Frank': 0.4}
Closeness Centrality: {'Alice': 0.7142857142857143, 'Bob': 0.625, 'Charlie':
0.625, 'David': 0.7142857142857143, 'Eve': 0.625, 'Frank': 0.625}
Betweenness Centrality: {'Alice': 0.25, 'Bob': 0.05, 'Charlie': 0.05, 'David':
0.25, 'Eve': 0.1, 'Frank': 0.1}
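The node-level centralities above can be complemented with a couple of graph-level statistics; the sketch below assumes the same create_social_network() graph and networkx import.

# Graph-level statistics (sketch) for the same six-person network
G = create_social_network()
print("Density:", nx.density(G))
print("Average shortest path length:", nx.average_shortest_path_length(G))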
EXPERIMENT – 5
CODE : Opinion mining
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
data = {
'Text': [
'I love this product! It works wonderfully.',
'This is the worst service I have ever used.',
'The experience was okay, nothing special.',
'Fantastic quality and amazing performance.',
'I am very disappointed with the purchase.'
],
'Sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative']
}
df = pd.DataFrame(data)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Text'])
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
OUTPUT:
Accuracy: 1.0
Classification Report:
precision recall f1-score support
negative 1.00 1.00 1.00 1
accuracy 1.00 1
macro avg 1.00 1.00 1.00 1
weighted avg 1.00 1.00 1.00 1
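Once trained, the same vectorizer and model can label unseen text; the sketch below assumes the objects created above, and the input sentence is illustrative only.

# Classifying a new, unseen review (sketch)
new_text = ["The product quality is amazing."]
print(model.predict(vectorizer.transform(new_text)))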
EXPERIMENT – 6
CODE : Sentiment analysis
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
sentences = [
"I absolutely love this! It's fantastic.",
"This is the worst thing ever.",
"I'm feeling pretty neutral about this.",
"What an amazing experience!",
"I hate it when this happens."
]
for sentence in sentences:
    sentiment = sia.polarity_scores(sentence)
    print(f"Sentence: {sentence}")
    print(f"Sentiment Scores: {sentiment}\n")
OUTPUT :
[nltk_data] Downloading package vader_lexicon to
[nltk_data] C:\Users\AppData\Roaming\nltk_data...
Sentence: I absolutely love this! It's fantastic.
Sentiment Scores: {'neg': 0.0, 'neu': 0.264, 'pos': 0.736, 'compound': 0.855}
Sentence: This is the worst thing ever.
Sentiment Scores: {'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}
Sentence: I'm feeling pretty neutral about this.
Sentiment Scores: {'neg': 0.0, 'neu': 0.46, 'pos': 0.54, 'compound': 0.5719}
Sentence: What an amazing experience!
Sentiment Scores: {'neg': 0.0, 'neu': 0.423, 'pos': 0.577, 'compound': 0.6239}
Sentence: I hate it when this happens.
Sentiment Scores: {'neg': 0.481, 'neu': 0.519, 'pos': 0.0, 'compound': -0.5719}
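The compound score is often mapped to a discrete label using the commonly cited +/-0.05 thresholds; this mapping is a convention, not part of the lab code, and the sketch below reuses the sia analyzer and sentences defined above.

def label_from_compound(compound):
    # Conventional thresholds (assumption): >= 0.05 positive, <= -0.05 negative, else neutral
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

for sentence in sentences:
    score = sia.polarity_scores(sentence)['compound']
    print(sentence, "->", label_from_compound(score))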
EXPERIMENT – 7
CODE : Privatization of web content
from flask import Flask, request, jsonify, session, redirect, url_for
from werkzeug.security import generate_password_hash, check_password_hash
app = Flask(__name__)
app.secret_key = 'your_secret_key'
users = {
"user1": generate_password_hash("password1"),
"user2": generate_password_hash("password2"),
}
private_content = {
"user1": "This is private content for user1.",
"user2": "This is private content for user2."
}
@app.route('/')
def home():
    return "Welcome to the Web Content Privatization Demo! Please log in."

@app.route('/login', methods=['POST'])
def login():
    username = request.form['username']
    password = request.form['password']
    if username in users and check_password_hash(users[username], password):
        session['username'] = username
        return jsonify({"message": "Login successful!"})
    return jsonify({"message": "Invalid credentials!"}), 401

@app.route('/logout')
def logout():
    session.pop('username', None)
    return redirect(url_for('home'))

@app.route('/private')
def private():
    if 'username' in session:
        user = session['username']
        return jsonify({"content": private_content[user]})
    return jsonify({"message": "Unauthorized access!"}), 401

if __name__ == '__main__':
    app.run(debug=True)
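With the Flask app running locally (a sketch that assumes the default development server on 127.0.0.1:5000 and the requests library), the login and private routes can be exercised as follows; the Session object keeps the login cookie between calls.

import requests

s = requests.Session()   # keeps the session cookie set by /login
r = s.post("http://127.0.0.1:5000/login",
           data={"username": "user1", "password": "password1"})
print(r.json())
print(s.get("http://127.0.0.1:5000/private").json())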
OUTPUT:
EXPERIMENT – 8
CODE : Web usage mining
import pandas as pd
from datetime import datetime
data = {
'UserID': [1, 1, 1, 2, 2, 3, 3, 3, 3],
'Page': ['Home', 'Product', 'Cart', 'Home', 'About', 'Home', 'Product', 'Cart', 'Checkout'],
'Timestamp': [
'2024-11-01 10:00:00', '2024-11-01 10:05:00', '2024-11-01 10:10:00',
'2024-11-01 10:00:00', '2024-11-01 10:07:00',
'2024-11-01 11:00:00', '2024-11-01 11:02:00', '2024-11-01 11:05:00', '2024-11-01 11:10:00'
]
}
df = pd.DataFrame(data)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Next_Timestamp'] = df.groupby('UserID')['Timestamp'].shift(-1)
df['Session_Duration'] = (df['Next_Timestamp'] - df['Timestamp']).dt.total_seconds().fillna(0)
page_counts = df['Page'].value_counts()
session_duration = df.groupby('UserID')['Session_Duration'].sum()
print("Most Visited Pages:\n", page_counts)
print("\nAverage Session Duration per User:\n", session_duration)
OUTPUT:
Most Visited Pages:
Page
Home 3
Product 2
Cart 2
About 1
Checkout 1
Name: count, dtype: int64
Average Session Duration per User:
UserID
1 600.0
2 420.0
3 600.0
Name: Session_Duration, dtype: float64
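The same DataFrame also supports simple navigation-path mining; the sketch below reuses the df built above and reconstructs each user's click sequence in time order.

# Click path per user (sketch)
paths = df.sort_values('Timestamp').groupby('UserID')['Page'].apply(list)
print("Click paths per user:\n", paths)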
EXPERIMENT – 9
CODE : Recommender system
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
data = {
'User': ['User1', 'User1', 'User1', 'User2', 'User2', 'User3', 'User3', 'User3', 'User3'],
'Item': ['Item1', 'Item2', 'Item3', 'Item1', 'Item4', 'Item1', 'Item2', 'Item3', 'Item4'],
'Rating': [5, 3, 4, 4, 5, 5, 4, 3, 2]
}
df = pd.DataFrame(data)
user_item_matrix = df.pivot_table(index='User', columns='Item', values='Rating').fillna(0)
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index,
columns=user_item_matrix.index)
def recommend(user, user_similarity_df, user_item_matrix, n_recommendations=2):
    scores = user_similarity_df[user].sort_values(ascending=False)
    similar_users = scores.index[scores > 0].tolist()
    recommendations = pd.Series(dtype=float)
    for similar_user in similar_users:
        weighted_ratings = user_item_matrix.loc[similar_user] * scores[similar_user]
        recommendations = recommendations.add(weighted_ratings, fill_value=0)
    already_rated = user_item_matrix.loc[user]
    recommendations = recommendations[already_rated == 0].sort_values(ascending=False)
    return recommendations.head(n_recommendations)
print("Recommendations for User1:\n", recommend('User1', user_similarity_df, user_item_matrix))
OUTPUT:
Recommendations for User1:
Item
Item4 4.094641
dtype: float64
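The recommend function is not specific to User1; as a usage note (reusing the objects built above), it can be called for any user in the matrix, for example User2.

# Recommendations for another user (sketch)
print("Recommendations for User2:\n", recommend('User2', user_similarity_df, user_item_matrix))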
EXPERIMENT – 10
CODE: Web structure mining
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(["PageA", "PageB", "PageC", "PageD", "PageE"])
G.add_edges_from([
("PageA", "PageB"),
("PageA", "PageC"),
("PageB", "PageC"),
("PageC", "PageA"),
("PageC", "PageD"),
("PageD", "PageE"),
("PageE", "PageD")
])
pagerank = nx.pagerank(G, alpha=0.85)
for page, rank in pagerank.items():
    print(f"{page}: {rank:.4f}")
OUTPUT:
PageA: 0.0805
PageB: 0.0642
PageC: 0.1188
PageD: 0.3819
PageE: 0.3546
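Besides PageRank, hub and authority scores (HITS) are another standard web-structure-mining measure; the sketch below reuses the same directed graph G and networkx's hits function.

# HITS hub and authority scores (sketch) on the same graph
hubs, authorities = nx.hits(G)
print("Hubs:", {p: round(h, 4) for p, h in hubs.items()})
print("Authorities:", {p: round(a, 4) for p, a in authorities.items()})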