Nk_traffic_breakdown

Home

Description

nk_traffic_breakdown() is a function for generating a report on the traffic of all domains hosted on a server. It is divided into several helper functions, each of which extracts a different type of information from the domains' log files. It combines the results of those helpers into a table with the following information for each domain:

Domain: the domain name

Requests: the total number of requests made to the domain

GET/POST: the number of requests that were GET and POST

%-Requests: the percentage of requests made to the domain out of the total requests for all domains

Bandwidth: the total bandwidth used by the domain

%-Bandwidth: the percentage of bandwidth used by the domain out of the total bandwidth for all domains

XMLRPC: the number of requests made to the domain whose log entry contains “xmlrpc” (normally in the URL)

Bots: the number of requests made to the domain by bots, crawlers or spiders.

Example

[root@cloudvpsserver ~]# nk_traffic_breakdown
Domain     Requests  GET/POST     %-Requests  Bandwidth  %-Bandwidth  XMLRPC        Bots
nkern.net  8270      (3820/4288)  100%        77MB       100%         3529(42.6%%)  1358(16.4%%)

Code

nk_traffic_breakdown() {
# This is basically a wrapper around gen_traffic_breakdown_body that does the formatting.
all_domains=$(nk_list_all_domains)
gen_traffic_breakdown_body() {

gen_latest_domlogs() {
# Print the two most recently modified archived logs for a domain, newest
# first, one path per line. Two are taken so that both the http and the https
# log of the latest archive are covered.
# Arguments: $1 - domain name
# Outputs:   up to two file paths on stdout; nothing when the account has no
#            archive directory or no log whose name starts with "<domain>-".
local domain="$1"
local user archive_dir
user="$(nk_user "$domain")"
archive_dir="/home/$user/logs"

# No archive directory simply means no logs; stay quiet rather than letting
# find print an error into the report.
[ -d "$archive_dir" ] || return 0

# -name compares the file name against "<domain>-*" as a glob, so the dots in
# the domain are literal (a regex grep would also accept e.g. "exampleXcom-").
# stat prints "<mtime epoch> <path>"; sort newest first, keep two, and strip
# the timestamp with cut so paths containing spaces survive intact.
find "$archive_dir" -maxdepth 1 -type f -name "${domain}-*" \
    -exec stat --format='%Y %n' {} + |
    sort -rn | head -n 2 | cut -d' ' -f2-
}

gen_num_requests() {
# Total number of requests for the current domain.
# Every log line is one request, so decompress each archive listed in
# $latest_logs into one stream and count the lines of that stream.
local archive
{
    for archive in $latest_logs; do
        zcat "$archive"
    done
} | wc -l
}

gen_gets() {
# Count the GET requests in the logs listed in $latest_logs.
# Outputs: one integer on stdout; 0 when there are no logs.
local log
for log in $latest_logs ; do
    # Match the method at the start of the quoted request line ("GET /path ...).
    # A bare "GET" would also count lines that merely mention GET in the URL,
    # referrer or user agent.
    zgrep -c '"GET ' "$log"
# Sum the per-log counts; "+ 0" makes awk print 0 instead of an empty line
# when the loop produced no input.
done | awk '{sum += $1} END {print sum + 0}'
}

gen_posts() {
# Count the POST requests in the logs listed in $latest_logs.
# Outputs: one integer on stdout; 0 when there are no logs.
local log
for log in $latest_logs ; do
    # Match the method at the start of the quoted request line ("POST /path ...).
    # A bare "POST" would also count lines that merely mention POST in the URL,
    # referrer or user agent.
    zgrep -c '"POST ' "$log"
# Sum the per-log counts; "+ 0" makes awk print 0 instead of an empty line
# when the loop produced no input.
done | awk '{sum += $1} END {print sum + 0}'
}

gen_bandwidth() {
# Sum the bytes transferred (10th whitespace-separated field of each log line)
# over the logs listed in $latest_logs.
# Outputs: the total as a plain integer on stdout; 0 when there are no logs.
local log
for log in $latest_logs; do
    zcat "$log"
# Only purely numeric values are added, which skips the "-" placeholder
# entries. printf "%.0f" prints the total as a plain integer even when it is
# large (some awk implementations switch "print" to exponent notation) and
# prints 0 rather than an empty line when nothing was read, so the caller can
# always hand the value to numfmt.
done | awk '$10 ~ /^[0-9]+$/ {sum += $10} END {printf "%.0f\n", sum}'
}

gen_total_requests() {
# Get the total number of requests across the latest domlogs of every domain
# listed in $all_domains.
# Outputs: one integer on stdout; 0 when no logs are found.
gen_latest_domlogs_all() {
# Run gen_latest_domlogs on every domain on the server.
local domain
for domain in $all_domains ; do
    gen_latest_domlogs "$domain"
done
}

# Decompress every log found by gen_latest_domlogs_all into one stream and
# count its lines, the same way gen_num_requests does for a single domain.
# wc -l prints 0 for an empty stream, so callers never receive a blank total.
local domlog
for domlog in $(gen_latest_domlogs_all); do
    zcat "$domlog"
done | wc -l
}

gen_total_bandwidth() {
# Generate the total bandwidth between all matched logs, i.e. the latest
# domlogs of every domain listed in $all_domains.
# Outputs: the total as a plain integer on stdout; 0 when no logs are found.
gen_latest_domlogs_all() {
# Run gen_latest_domlogs for every domain on the server. Found via nk_list_all_domains.
local domain
for domain in $all_domains ; do
    gen_latest_domlogs "$domain"
done
}

# Now for all the logs found by gen_latest_domlogs_all
local domlog
for domlog in $(gen_latest_domlogs_all); do
    zcat "$domlog"
# Add up the 10th field, which is the bytes transferred. Only purely numeric
# values are added, which skips the "-" placeholder entries. printf "%.0f"
# keeps large totals as plain integers (no exponent notation) and prints 0
# instead of an empty line when there was nothing to sum.
done | awk '$10 ~ /^[0-9]+$/ {sum += $10} END {printf "%.0f\n", sum}'
}

gen_xmlrpc() {
# Check how many xmlrpc hits are in the logs listed in $latest_logs.
# Outputs: one integer on stdout; 0 when there are no logs.
local log
for log in $latest_logs; do
    # Count the log lines that mention "xmlrpc".
    zgrep -c "xmlrpc" "$log"
# Sum the per-log counts; "+ 0" makes awk print 0 instead of an empty line
# when the loop produced no input.
done | awk '{sum += $1} END {print sum + 0}'
}

gen_bots() {
# Check how many hits in the logs listed in $latest_logs are from bots.
# Outputs: one integer on stdout; 0 when there are no logs.
local log
for log in $latest_logs; do
    # Count the log lines containing "bot", "crawl" or "spider", ignoring case.
    zgrep -cEi '(bot|crawl|spider)' "$log"
# Sum the per-log counts; "+ 0" makes awk print 0 instead of an empty line
# when the loop produced no input.
done | awk '{sum += $1} END {print sum + 0}'
}

# First calculate the values for total_bandwidth and total_requests.
# These are the same no matter which domain you're looking at so they're defined outside of the loop.
total_bandwidth="$(gen_total_bandwidth)"
total_requests="$(gen_total_requests)"

# Write the header for our table.
echo "Domain | Requests GET/POST %-Requests | Bandwidth %-Bandwidth | XMLRPC Bots |"
echo "--- - --- --- --- - --- --- - --- --- -"
# Now for every domain on the server, found with nk_list_all_domains
for domain in $all_domains ; do
    # Define these variables.
    # latest_logs : The archived logs to analyse for this domain. Read by the gen_* helpers.
    # requests : The number of requests in the domlogs. Found with gen_num_requests
    # gets : The number of those requests that are GET. Found via gen_gets
    # posts : The number of those requests that are POST. Found via gen_posts.
    # gets_posts : gets and posts formatted together as such (GETS/POSTS)
    # percent_requests : The site is responsible for what percent of total requests.
    # bandwidth : The amount of bandwidth in the site's logs. Found via gen_bandwidth
    # bandwidth_human : bandwidth formatted into human readable. ie kb, mb, gb
    # bandwidth_percent : The bandwidth for the site is what percent of total bandwidth.
    # xmlrpc : The number of requests to xmlrpc in the log.
    # xmlrpc_percent : Percent of site requests that are xmlrpc.
    # xmlrpc_result : Format xmlrpc and xmlrpc_percent into a single result. ie 213(13%)
    # bots : The number of requests that are bots.
    # bots_percent : The bots are what percent of total traffic to the site.
    # bots_result : Format bots and bots_percent together. ie 143(9%)
    latest_logs="$(gen_latest_domlogs "$domain")"
    requests="$(gen_num_requests)"
    gets="$(gen_gets)"
    posts="$(gen_posts)"
    gets_posts="($gets/$posts)"
    percent_requests="$(nk_percent "$requests" "$total_requests")"
    bandwidth="$(gen_bandwidth)"
    # A domain without logs may yield an empty bandwidth; numfmt rejects an
    # empty argument, so fall back to 0.
    bandwidth_human="$(numfmt --to=iec --suffix=B "${bandwidth:-0}")"
    bandwidth_percent="$(nk_percent "$bandwidth" "$total_bandwidth")"
    xmlrpc="$(gen_xmlrpc)"
    xmlrpc_percent="$(nk_percent "$xmlrpc" "$requests")"
    # Drop a trailing "%" from the percentage before appending our own, so the
    # cell reads 213(13%) and not 213(13%%) when nk_percent already adds one.
    xmlrpc_result="$xmlrpc(${xmlrpc_percent%\%}%)"
    bots="$(gen_bots)"
    bots_percent="$(nk_percent "$bots" "$requests")"
    bots_result="$bots(${bots_percent%\%}%)"

    # Print out row values using the variables we defined above.
    # The "-" placeholders keep the number of columns constant for column -t
    # should a percentage come back empty.
    echo "$domain | $requests $gets_posts ${percent_requests:--} | $bandwidth_human ${bandwidth_percent:--} | $xmlrpc_result $bots_result |"
done | sort -rn -k 3,3
# Once the table rows have finished, sort them by highest request first.
# The request count is the 3rd whitespace-separated field (the 2nd is the "|"
# column separator), so the key has to be field 3.
}

# Run gen_traffic_breakdown_body and format it as a table.
gen_traffic_breakdown_body | column -t
}

Author: Nichole Kernreicht

Created: 2023-04-09 Sun 23:27